Source code for dynamo.external.scifate

# integrate with Scribe (Qiu, et. al, Cell Systems, 2020) next
# the following code is based on Cao, et. al, Nature Biotechnology, 2020 and
# https://github.com/JunyueC/sci-fate_analysis

from typing import Dict, Union

import numpy as np
import pandas as pd
from anndata import AnnData
from anndata.utils import make_index_unique
from scipy.sparse import issparse
from tqdm import tqdm

from ..tools.utils import einsum_correlation
from .utils import TF_link_gene_chip, normalize_data


[docs]def scifate_glmnet(
    adata: AnnData,
    gene_filter_rate: float = 0.1,
    cell_filter_UMI: int = 10000,
    core_n_lasso: int = 1,
    core_n_filtering: int = 1,
    motif_ref: str = "https://www.dropbox.com/s/s8em539ojl55kgf/motifAnnotations_hgnc.csv?dl=1",
    TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1",
    nt_layers: list = ["X_new", "X_total"],
) -> AnnData:
    """Perform scifate analysis using glmnet.

     Reconstruction of regulatory network (Cao, et. al, Nature Biotechnology, 2020) from TFs to other target
     genes via LASSO regression between the total expression of known transcription factors and the newly synthesized
     RNA of potential targets. The inferred regulatory relationships between TF and targets are further filtered based
     on evidence of promoter motifs (not implemented currently) and the ENCODE chip-seq peaks. The python wrapper for the
     glmnet FORTRON code, glm-python (https://github.com/bbalasub1/glmnet_python) was used. More details on lasso regression
     with glm-python can be found here (https://github.com/bbalasub1/glmnet_python/blob/master/test/glmnet_examples.ipynb).
     Note that this function can be applied to both of the metabolic labeling single-cell assays with newly synthesized
     and total RNA as well as the regular single cell assays with both the unspliced and spliced transcripts. Furthermore,
     you can also replace either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription,
     splicing and degradation rates for each gene (similarly, replacing the expression values of transcription factors with RNA binding,
     ribosome, epigenetics or epitranscriptomic factors, etc.) to infer the tottal regulatory effects, transcription, splicing
     and post-transcriptional regulation of different factors. In addition, this approach will be fully integrated with
     Scribe (Qiu, et. al, 2020) which employs restricted directed information to determine causality by estimating the
     strength of information transferred from a potential regulator to its downstream target. In contrast, of lasso regression,
     Scribe can learn both linear and non-linear causality in deterministic and stochastic systems. It also incorporates
     rigorous procedures to alleviate sampling bias and builds upon novel estimators and regularization techniques to
     facilitate inference of large-scale causal networks.

    Args:
        adata: Adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
            the object should include both unspliced and spliced gene expression of cells.
        gene_filter_rate: Minimum percentage of expressed cells for gene filtering.
        cell_filter_UMI: Minimum number of UMIs for cell filtering.
        core_n_lasso: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is
            turned off. Parallel computing can significantly speed up the computation process, especially for datasets
            involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due
            to the additional overhead. User discretion is advised.
        core_n_filtering: Number of cores for filtering TF-gene links. Not used currently.
        motif_ref: The path to the TF binding motif data as described above. It provides the list of TFs gene names and
            is used to process adata object to generate the TF expression and target new expression matrix for glmnet
            based TF-target synthesis rate linkage analysis. But currently it is not used for motif based filtering.
            By default, it is a dropbox link that store the data from us. Other motif reference can bed downloaded from
            RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from
            June's shared folder:
                https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather
        TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a dropbox link from us that stores the
            data. Other data can be downloaded from:
                https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets.
        nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly.
            See the description of this function above.

        Note that if your internet connection is slow, we recommend to download the `motif_ref` and `TF_link_ENCODE_ref` and
        supplies those two arguments with the local paths where the downloaded datasets are saved.

    Returns:
        An updated adata object with a new key `scifate` in .uns attribute, which stores the raw lasso regression results
        and the filter results after applying the Fisher exact test of the ChIP-seq peaks.
    """

    try:
        import glmnet_python

        global cvglmnet, cvglmnetCoef
        from glmnet_python import cvglmnet, cvglmnetCoef
    except ImportError:
        raise ImportError(
            "You need to install the package `glmnet_python`."
            "The original version https://github.com/bbalasub1/glmnet_python is not updated."
            "Plelease install johnlees's fork https://github.com/johnlees/glmnet_python."
            "Also check this pull request for more details: "
            "https://github.com/bbalasub1/glmnet_python/pull/47"
        )

    df_gene_TF_link_ENCODE = pd.read_csv(TF_link_ENCODE_ref, sep="\t")
    motifAnnotations_hgnc = pd.read_csv(motif_ref, sep="\t")
    TF_list = motifAnnotations_hgnc.loc[:, "TF"]

    _, new_mat, obs, var, var_TF, TF_matrix = adata_processing_TF_link(
        adata, nt_layers, TF_list, gene_filter_rate, cell_filter_UMI
    )

    # Apply LASSO for TF-gene linkage analysis
    new_size_factor = obs.loc[:, "Size_Factor"]
    new_size_factor = (new_size_factor - np.mean(new_size_factor)) / np.std(new_size_factor)

    TF_matrix.loc["new_size_factor"] = new_size_factor
    labeling_rate = obs["labeling_rate"]
    if np.std(labeling_rate) != 0:
        labeling_ratio = (labeling_rate - np.mean(labeling_rate)) / np.std(labeling_rate)
        TF_matrix.loc["labeling_ratio", :] = labeling_ratio

    # link TF and genes based on covariance
    link_result = link_TF_gene_analysis(TF_matrix, new_mat, var_TF, core_num=core_n_lasso)
    link_result = pd.concat([i if type(i) != str else None for i in link_result], axis=0)

    # filtering the links using TF-gene binding data and store the result in the target folder
    # note that currently the motif filtering is not implement
    df_gene_TF_link_chip = TF_gene_filter_links(link_result, var, df_gene_TF_link_ENCODE)

    adata.uns["scifate"] = {
        "glmnet_res": link_result,
        "glmnet_chip_filter": df_gene_TF_link_chip,
    }

    return adata


def adata_processing_TF_link(
    adata: AnnData,
    nt_layers: list,
    TF_list: list,
    gene_filter_rate: float = 0.1,
    cell_filter_UMI: int = 10000,
) -> tuple:
    """Preprocess adata and get ready for TF-target gene analysis.

    Args:
        adata: Adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
            the object should include both unspliced and spliced gene expression of cells.
        nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly.
        TF_list: The list of transcription factors.
        gene_filter_rate: Minimum percentage of expressed cells for gene filtering.
        cell_filter_UMI: Minimum number of UMIs for cell filtering.

    Returns:
        A tuple of the following:
            - tot_mat: The total gene expression matrix.
            - new_mat: The newly synthesized gene expression matrix.
            - obs: The observation dataframe.
            - var: The variable dataframe.
            - var_TF: The variable matrix for transcription factors.
            - TF_matrix: The matrix of transcription factors.
    """

    n_obs, n_var = adata.n_obs, adata.n_vars

    # filter genes
    print(f"Original gene number: {n_var}")

    gene_filter_new = (adata.layers[nt_layers[0]] > 0).sum(0) > (gene_filter_rate * n_obs)
    gene_filter_tot = (adata.layers[nt_layers[1]] > 0).sum(0) > (gene_filter_rate * n_obs)
    if issparse(adata.layers[nt_layers[0]]):
        gene_filter_new = gene_filter_new.A1
    if issparse(adata.layers[nt_layers[1]]):
        gene_filter_tot = gene_filter_tot.A1
    adata = adata[:, gene_filter_new * gene_filter_tot]

    print(f"Gene number after filtering: {sum(gene_filter_new * gene_filter_tot)}")

    # filter cells
    print(f"Original cell number: {n_obs}")

    cell_filter = adata.layers[nt_layers[1]].sum(1) > cell_filter_UMI
    if issparse(adata.layers[nt_layers[1]]):
        cell_filter = cell_filter.A1
    adata = adata[cell_filter, :]

    print(f"Cell number after filtering: {adata.n_obs}")

    # generate the expression matrix for downstream analysis
    new = adata.layers[nt_layers[0]]
    total = adata.layers[nt_layers[1]]

    # recalculate size factor
    from ..preprocessing.deprecated import calc_sz_factor_legacy

    adata = calc_sz_factor_legacy(
        adata,
        method="mean-geometric-mean-total",
        round_exprs=True,
        total_layers=["total"],
    )
    szfactors = adata.obs["Size_Factor"][:, None]

    # normalize data (size factor correction, log transform and the scaling)
    if issparse(new):
        new = new.A
    if issparse(total):
        total = total.A
    new_mat = normalize_data(new, szfactors, pseudo_expr=0.1)
    tot_mat = normalize_data(total, szfactors, pseudo_expr=0.1)
    new_mat = pd.DataFrame(new_mat, index=adata.obs_names, columns=adata.var_names)
    tot_mat = pd.DataFrame(tot_mat, index=adata.obs_names, columns=adata.var_names)

    # compute the labeling reads rate in each cell
    obs = adata.obs
    var = adata.var
    var.loc[:, "gene_short_name"] = make_index_unique(var.loc[:, "gene_short_name"].astype("str"))
    ntr = adata.layers["new"].sum(1).A1 / adata.layers["total"].sum(1).A1
    if issparse(ntr):
        obs.loc[:, "labeling_rate"] = ntr.A1
    else:
        obs.loc[:, "labeling_rate"] = ntr

    # extract the TF matrix
    var_TF = var.query("gene_short_name in @TF_list")
    print(f"\nNumber of TFs found in the list: {var_TF.shape[0]}")

    TF_matrix = tot_mat.loc[:, var_TF.loc[:, "gene_id"]]
    TF_matrix.columns = var_TF.loc[:, "gene_short_name"]

    return (tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)


def link_TF_gene_analysis(
    TF_matrix: pd.DataFrame,
    gene_matrix: pd.DataFrame,
    var_TF: pd.DataFrame,
    core_num: int = 1,
    cor_thresh: float = 0.03,
    seed: int = 123456,
) -> list:
    """Perform lasso regression for each gene.

    Args:
        TF_matrix: The dataframe of transcription factors.
        gene_matrix: The dataframe of genes.
        var_TF: The variable dataframe for transcription factors.
        core_num: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is
            turned off. Parallel computing can significantly speed up the computation process, especially for datasets
            involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due
            to the additional overhead. User discretion is advised.
        cor_thresh: The threshold for the correlation.
        seed: The random seed.

    Returns:
        A list of the results of the lasso regression for each gene.
    """

    gene_list = gene_matrix.index
    link_result = [None] * len(gene_list)

    for i, linked_gene in tqdm(enumerate(gene_list), desc=f"Perform lasso regression for each gene:"):
        gene_name, TFs = (
            var_TF.loc[linked_gene, "gene_short_name"] if linked_gene in var_TF.index else None,
            TF_matrix.index,
        )
        valid_gene_ids = TFs if gene_name is None else list(TFs.difference(set(gene_name)))
        input_TF_matrix = TF_matrix.loc[valid_gene_ids, :]

        result = TF_gene_link(
            input_TF_matrix,
            linked_gene,
            gene_matrix.loc[linked_gene, :],
            core_num=core_num,
            cor_thresh=cor_thresh,
            seed=seed,
        )
        link_result[i] = result

    return link_result


def TF_gene_link(
    TF_matrix: pd.DataFrame,
    linked_gene: str,
    linked_gene_expr_vector: pd.Series,
    core_num: int = 1,
    cor_thresh: float = 0.03,
    seed: int = 123456,
) -> Union[pd.DataFrame, str]:
    """Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene.

    Args:
        TF_matrix: The dataframe of transcription factors.
        linked_gene: The name of the linked gene.
        linked_gene_expr_vector: The expression vector of the linked gene.
        core_num: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is
            turned off. Parallel computing can significantly speed up the computation process, especially for datasets
            involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due
            to the additional overhead. User discretion is advised.
        cor_thresh: The threshold for the correlation.
        seed: The random seed.

    Returns:
        The result of the lasso regression for the gene.
    """

    expr_cor = einsum_correlation(TF_matrix.values, linked_gene_expr_vector.values)[0]
    select_sites = abs(expr_cor) > cor_thresh
    TF_matrix = TF_matrix.iloc[select_sites, :]

    if select_sites.sum() > 1:
        TF_matrix = TF_matrix.T
        result = lasso_regression_expression(TF_matrix, linked_gene, linked_gene_expr_vector, seed, core_num)

        return result
    else:
        return "unknown"


def lasso_regression_expression(
    x1: pd.DataFrame,
    linked_gene: str,
    y: pd.Series,
    seed: int,
    parallel: int = 1,
) -> pd.DataFrame:
    """Lasso linear model with iterative fitting along a regularization path. Select best model is by cross-validation.

    Args:
        x1: The dataframe of the input matrix.
        linked_gene: The name of the linked gene.
        y: The expression vector of the linked gene.
        seed: The random seed.
        parallel: The number of cores for parallel computing.

    Returns:
        The result of the lasso regression.
    """

    x1 = x1.loc[:, ~x1.columns.duplicated(keep="first")]

    # the following code should match up that from cv.glmnet in R if the foldid is hard coded to be the same.
    # https://github.com/bbalasub1/glmnet_python/issues/45

    np.random.seed(seed)
    seq = np.linspace(np.log(0.001), np.log(10), 100)

    # alpha = 1 is required for lasso regression.
    cv_out = cvglmnet(
        x=x1.values.astype("float64"),
        y=y.values.astype("float64"),
        ptype="mse",
        alpha=1,
        lambdau=np.exp(seq),
        parallel=parallel,
    )
    r2 = r2_glmnet(cv_out, y)

    bestlam = cv_out["lambda_1se"]

    # this rescaling ensures returning very similar results to R's cv.glm
    cor_list = cvglmnetCoef(cv_out, s=bestlam).flatten() / np.sqrt(x1.shape[0])
    df_cor = pd.DataFrame(
        {
            "id": x1.columns,
            "corcoef": cor_list[1:],  # ignore the first intercept term
            "r_squre": r2,
            "linked_gene": linked_gene,
        }
    )

    return df_cor


def r2_glmnet(cv_out: Dict, y: pd.Series) -> float:
    """Calculate r2 using the lambda_1se. This value is for the most regularized model whose mean squared error is
    within one standard error of the minimal.

    Args:
        cv_out: The output of the cross-validation.
        y: The expression vector of the linked gene.

    Returns:
        The r2 value.
    """

    # https://stackoverflow.com/questions/50610895/how-to-calculate-r-squared-value-for-lasso-regression-using-glmnet-in-r
    bestlam = cv_out["lambda_1se"]
    i = np.where(cv_out["lambdau"] == bestlam)[0]
    e = cv_out["cvm"][i]
    r2 = 1 - e / np.var(y)
    if r2 < 0:
        r2 = [0]

    return r2[0]


def TF_gene_filter_links(
    raw_glmnet_res: pd.DataFrame,
    var: pd.DataFrame,
    df_gene_TF_link_ENCODE: pd.DataFrame,
) -> pd.DataFrame:
    """Prepare data for TF-target gene link filtering.

    Args:
        raw_glmnet_res: The raw result of glmnet.
        var: The variable dataframe.
        df_gene_TF_link_ENCODE: The dataframe of the TF chip-seq data.

    Returns:
        The filtered result of the lasso regression.
    """

    var_ori = var.copy()
    if "gene_type" not in var.columns:
        var["gene_type"] = None
    df_gene = var.loc[:, ["gene_id", "gene_short_name", "gene_type"]]
    df_gene.columns = ["linked_gene", "linked_gene_name", "gene_type"]

    raw_glmnet_res_var = raw_glmnet_res.merge(df_gene)
    raw_glmnet_res_var = raw_glmnet_res_var.query("id != 'labeling_ratio' and id != 'new_size_factor'")

    # filter the link by ChIP-seq analysis
    df_gene_TF_link_chip = TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var_ori, cor_thresh=0.04)

    return df_gene_TF_link_chip