Source code for dynamo.external.scifate

# integrate with Scribe (Qiu, et. al, Cell Systems, 2020) next
# the following code is based on Cao, et. al, Nature Biotechnology, 2020 and
# https://github.com/JunyueC/sci-fate_analysis

from typing import Union

import numpy as np
import pandas as pd
from anndata import AnnData
from anndata.utils import make_index_unique
from scipy.sparse import issparse
from tqdm import tqdm

from ..tools.utils import einsum_correlation
from .utils import TF_link_gene_chip, normalize_data


[docs]def scifate_glmnet( adata: AnnData, gene_filter_rate: float = 0.1, cell_filter_UMI: int = 10000, core_n_lasso: int = 1, core_n_filtering: int = 1, motif_ref: str = "https://www.dropbox.com/s/s8em539ojl55kgf/motifAnnotations_hgnc.csv?dl=1", TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1", nt_layers: list = ["X_new", "X_total"], ) -> AnnData: """Reconstruction of regulatory network (Cao, et. al, Nature Biotechnology, 2020) from TFs to other target genes via LASSO regression between the total expression of known transcription factors and the newly synthesized RNA of potential targets. The inferred regulatory relationships between TF and targets are further filtered based on evidence of promoter motifs (not implemented currently) and the ENCODE chip-seq peaks. The python wrapper for the glmnet FORTRON code, glm-python (https://github.com/bbalasub1/glmnet_python) was used. More details on lasso regression with glm-python can be found here (https://github.com/bbalasub1/glmnet_python/blob/master/test/glmnet_examples.ipynb). Note that this function can be applied to both of the metabolic labeling single-cell assays with newly synthesized and total RNA as well as the regular single cell assays with both the unspliced and spliced transcripts. Furthermore, you can also replace the either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription, splicing and degradation rates for each gene (similarly, replacing the expression values of transcription factors with RNA binding, ribosome, epigenetics or epitranscriptomic factors, etc.) to infer the tottal regulatory effects, transcription, splicing and post-transcriptional regulation of different factors. In addition, this approach will be fully integrated with Scribe (Qiu, et. al, 2020) which employs restricted directed information to determine causality by estimating the strength of information transferred from a potential regulator to its downstream target. In contrast of lasso regression, Scribe can learn both linear and non-linear causality in deterministic and stochastic systems. It also incorporates rigorous procedures to alleviate sampling bias and builds upon novel estimators and regularization techniques to facilitate inference of large-scale causal networks. Parameters ---------- adata: :class:`~anndata.AnnData`. adata object that includes both newly synthesized and total gene expression of cells. Alternatively, the object should include both unspliced and spliced gene expression of cells. gene_filter_rate: minimum percentage of expressed cells for gene filtering. cell_filter_UMI: minimum number of UMIs for cell filtering. core_n_lasso: number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is turned off. Parallel computing can significantly speed up the computation process, especially for datasets involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due to the additional overhead. User discretion is advised. core_n_filtering: number of cores for filtering TF-gene links. Not used currently. motif_ref: The path to the TF binding motif data as described above. It provides the list of TFs gene names and is used to process adata object to generate the TF expression and target new expression matrix for glmnet based TF-target synthesis rate linkage analysis. But currently it is not used for motif based filtering. By default it is a dropbox link that store the data from us. Other motif reference can bed downloaded from RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from June's shared folder: https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather TF_link_ENCODE_ref: The path to the TF chip-seq data. By default it is a dropbox link from us that stores the data. Other data can be downloaded from: https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets. nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly. See the description of this function above. Note that if your internet connection is slow, we recommend to download the `motif_ref` and `TF_link_ENCODE_ref` and supplies those two arguments with the local paths where the downloaded datasets are saved. Returns ------- An updated adata object with a new key `scifate` in .uns attribute, which stores the raw lasso regression results and the filter results after applying the Fisher exact test of the ChIP-seq peaks. """ try: import glmnet_python global cvglmnet, cvglmnetCoef from glmnet_python import cvglmnet, cvglmnetCoef except ImportError: raise ImportError( "You need to install the package `glmnet_python`." "The original version https://github.com/bbalasub1/glmnet_python is not updated." "Plelease install johnlees's fork https://github.com/johnlees/glmnet_python." "Also check this pull request for more details: " "https://github.com/bbalasub1/glmnet_python/pull/47" ) df_gene_TF_link_ENCODE = pd.read_csv(TF_link_ENCODE_ref, sep="\t") motifAnnotations_hgnc = pd.read_csv(motif_ref, sep="\t") TF_list = motifAnnotations_hgnc.loc[:, "TF"] _, new_mat, obs, var, var_TF, TF_matrix = adata_processing_TF_link( adata, nt_layers, TF_list, gene_filter_rate, cell_filter_UMI ) # Apply LASSO for TF-gene linkage analysis new_size_factor = obs.loc[:, "Size_Factor"] new_size_factor = (new_size_factor - np.mean(new_size_factor)) / np.std(new_size_factor) TF_matrix.loc["new_size_factor"] = new_size_factor labeling_rate = obs["labeling_rate"] if np.std(labeling_rate) != 0: labeling_ratio = (labeling_rate - np.mean(labeling_rate)) / np.std(labeling_rate) TF_matrix.loc["labeling_ratio", :] = labeling_ratio # link TF and genes based on covariance link_result = link_TF_gene_analysis(TF_matrix, new_mat, var_TF, core_num=core_n_lasso) link_result = pd.concat([i if type(i) != str else None for i in link_result], axis=0) # filtering the links using TF-gene binding data and store the result in the target folder # note that currently the motif filtering is not implement df_gene_TF_link_chip = TF_gene_filter_links(link_result, var, core_n_filtering, motif_ref, df_gene_TF_link_ENCODE) adata.uns["scifate"] = { "glmnet_res": link_result, "glmnet_chip_filter": df_gene_TF_link_chip, } return adata
def adata_processing_TF_link( adata: AnnData, nt_layers: list, TF_list: list, gene_filter_rate: float = 0.1, cell_filter_UMI: int = 10000, ) -> tuple: """preprocess adata and get ready for TF-target gene analysis""" n_obs, n_var = adata.n_obs, adata.n_vars # filter genes print(f"Original gene number: {n_var}") gene_filter_new = (adata.layers[nt_layers[0]] > 0).sum(0) > (gene_filter_rate * n_obs) gene_filter_tot = (adata.layers[nt_layers[1]] > 0).sum(0) > (gene_filter_rate * n_obs) if issparse(adata.layers[nt_layers[0]]): gene_filter_new = gene_filter_new.A1 if issparse(adata.layers[nt_layers[1]]): gene_filter_tot = gene_filter_tot.A1 adata = adata[:, gene_filter_new * gene_filter_tot] print(f"Gene number after filtering: {sum(gene_filter_new * gene_filter_tot)}") # filter cells print(f"Original cell number: {n_obs}") cell_filter = adata.layers[nt_layers[1]].sum(1) > cell_filter_UMI if issparse(adata.layers[nt_layers[1]]): cell_filter = cell_filter.A1 adata = adata[cell_filter, :] print(f"Cell number after filtering: {adata.n_obs}") # generate the expression matrix for downstream analysis new = adata.layers[nt_layers[0]] total = adata.layers[nt_layers[1]] # recalculate size factor from ..preprocessing import calc_sz_factor_legacy adata = calc_sz_factor_legacy( adata, method="mean-geometric-mean-total", round_exprs=True, total_layers=["total"], ) szfactors = adata.obs["Size_Factor"][:, None] # normalize data (size factor correction, log transform and the scaling) if issparse(new): new = new.A if issparse(total): total = total.A new_mat = normalize_data(new, szfactors, pseudo_expr=0.1) tot_mat = normalize_data(total, szfactors, pseudo_expr=0.1) new_mat = pd.DataFrame(new_mat, index=adata.obs_names, columns=adata.var_names) tot_mat = pd.DataFrame(tot_mat, index=adata.obs_names, columns=adata.var_names) # compute the labeling reads rate in each cell obs = adata.obs var = adata.var var.loc[:, "gene_short_name"] = make_index_unique(var.loc[:, "gene_short_name"].astype("str")) ntr = adata.layers["new"].sum(1).A1 / adata.layers["total"].sum(1).A1 if issparse(ntr): obs.loc[:, "labeling_rate"] = ntr.A1 else: obs.loc[:, "labeling_rate"] = ntr # extract the TF matrix var_TF = var.query("gene_short_name in @TF_list") print(f"\nNumber of TFs found in the list: {var_TF.shape[0]}") TF_matrix = tot_mat.loc[:, var_TF.loc[:, "gene_id"]] TF_matrix.columns = var_TF.loc[:, "gene_short_name"] return (tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T) def link_TF_gene_analysis( TF_matrix: pd.DataFrame, gene_matrix: pd.DataFrame, var_TF: pd.DataFrame, core_num: int = 1, cor_thresh: float = 0.03, seed: int = 123456, ) -> list: """Perform lasso regression for each gene.""" gene_list = gene_matrix.index link_result = [None] * len(gene_list) for i, linked_gene in tqdm(enumerate(gene_list), desc=f"Perform lasso regression for each gene:"): gene_name, TFs = ( var_TF.loc[linked_gene, "gene_short_name"] if linked_gene in var_TF.index else None, TF_matrix.index, ) valid_gene_ids = TFs if gene_name is None else list(TFs.difference(set(gene_name))) input_TF_matrix = TF_matrix.loc[valid_gene_ids, :] result = TF_gene_link( input_TF_matrix, linked_gene, gene_matrix.loc[linked_gene, :], core_num=core_num, cor_thresh=cor_thresh, seed=seed, ) link_result[i] = result return link_result def TF_gene_link( TF_matrix: pd.DataFrame, linked_gene: str, linked_gene_expr_vector, core_num: int = 1, cor_thresh: float = 0.03, seed: int = 123456, ): """Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene.""" expr_cor = einsum_correlation(TF_matrix.values, linked_gene_expr_vector.values)[0] select_sites = abs(expr_cor) > cor_thresh TF_matrix = TF_matrix.iloc[select_sites, :] if select_sites.sum() > 1: TF_matrix = TF_matrix.T result = lasso_regression_expression(TF_matrix, linked_gene, linked_gene_expr_vector, seed, core_num) return result else: return "unknown" def lasso_regression_expression(x1, linked_gene, y, seed: int, parallel=1): """Lasso linear model with iterative fitting along a regularization path. Select best model is by cross-validation.""" x1 = x1.loc[:, ~x1.columns.duplicated(keep="first")] # the following code should match up that from cv.glmnet in R if the foldid is hard coded to be the same. # https://github.com/bbalasub1/glmnet_python/issues/45 np.random.seed(seed) seq = np.linspace(np.log(0.001), np.log(10), 100) # alpha = 1 is required for lasso regression. cv_out = cvglmnet( x=x1.values.astype("float64"), y=y.values.astype("float64"), ptype="mse", alpha=1, lambdau=np.exp(seq), parallel=parallel, ) r2 = r2_glmnet(cv_out, y) bestlam = cv_out["lambda_1se"] # this rescaling ensures returning very similar results to R's cv.glm cor_list = cvglmnetCoef(cv_out, s=bestlam).flatten() / np.sqrt(x1.shape[0]) df_cor = pd.DataFrame( { "id": x1.columns, "corcoef": cor_list[1:], # ignore the first intercept term "r_squre": r2, "linked_gene": linked_gene, } ) return df_cor def r2_glmnet(cv_out, y): """calculate r2 using the lambda_1se. This value is for the most regularized model whose mean squared error is within one standard error of the minimal.""" # https://stackoverflow.com/questions/50610895/how-to-calculate-r-squared-value-for-lasso-regression-using-glmnet-in-r bestlam = cv_out["lambda_1se"] i = np.where(cv_out["lambdau"] == bestlam)[0] e = cv_out["cvm"][i] r2 = 1 - e / np.var(y) if r2 < 0: r2 = [0] return r2[0] def TF_gene_filter_links(raw_glmnet_res: pd.DataFrame, var, core_n, motif_ref, df_gene_TF_link_ENCODE): """prepare data for TF-target gene link filtering""" var_ori = var.copy() if "gene_type" not in var.columns: var["gene_type"] = None df_gene = var.loc[:, ["gene_id", "gene_short_name", "gene_type"]] df_gene.columns = ["linked_gene", "linked_gene_name", "gene_type"] raw_glmnet_res_var = raw_glmnet_res.merge(df_gene) raw_glmnet_res_var = raw_glmnet_res_var.query("id != 'labeling_ratio' and id != 'new_size_factor'") # filter the link by ChIP-seq analysis df_gene_TF_link_chip = TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var_ori, cor_thresh=0.04) return df_gene_TF_link_chip