# integrate with Scribe (Qiu et al., Cell Systems, 2020) next
# the following code is based on Cao et al., Nature Biotechnology, 2020 and
# https://github.com/JunyueC/sci-fate_analysis
from typing import Union
import numpy as np
import pandas as pd
from anndata import AnnData
from anndata.utils import make_index_unique
from scipy.sparse import issparse
from tqdm import tqdm
from ..tools.utils import einsum_correlation
from .utils import TF_link_gene_chip, normalize_data
def scifate_glmnet(
adata: AnnData,
gene_filter_rate: float = 0.1,
cell_filter_UMI: int = 10000,
core_n_lasso: int = 1,
core_n_filtering: int = 1,
motif_ref: str = "https://www.dropbox.com/s/s8em539ojl55kgf/motifAnnotations_hgnc.csv?dl=1",
TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1",
nt_layers: list = ["X_new", "X_total"],
) -> AnnData:
"""Reconstruction of regulatory network (Cao, et. al, Nature Biotechnology, 2020) from TFs to other target
genes via LASSO regression between the total expression of known transcription factors and the newly synthesized
RNA of potential targets. The inferred regulatory relationships between TF and targets are further filtered based
on evidence of promoter motifs (not implemented currently) and the ENCODE chip-seq peaks. The python wrapper for the
glmnet FORTRON code, glm-python (https://github.com/bbalasub1/glmnet_python) was used. More details on lasso regression
with glm-python can be found here (https://github.com/bbalasub1/glmnet_python/blob/master/test/glmnet_examples.ipynb).
Note that this function can be applied to both of the metabolic labeling single-cell assays with newly synthesized
and total RNA as well as the regular single cell assays with both the unspliced and spliced transcripts. Furthermore,
you can also replace the either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription,
splicing and degradation rates for each gene (similarly, replacing the expression values of transcription factors with RNA binding,
ribosome, epigenetics or epitranscriptomic factors, etc.) to infer the tottal regulatory effects, transcription, splicing
and post-transcriptional regulation of different factors. In addition, this approach will be fully integrated with
Scribe (Qiu, et. al, 2020) which employs restricted directed information to determine causality by estimating the
strength of information transferred from a potential regulator to its downstream target. In contrast of lasso regression,
Scribe can learn both linear and non-linear causality in deterministic and stochastic systems. It also incorporates
rigorous procedures to alleviate sampling bias and builds upon novel estimators and regularization techniques to
facilitate inference of large-scale causal networks.
Parameters
----------
adata: :class:`~anndata.AnnData`.
adata object that includes both newly synthesized and total gene expression of cells. Alternatively,
the object should include both unspliced and spliced gene expression of cells.
gene_filter_rate:
minimum percentage of expressed cells for gene filtering.
cell_filter_UMI:
minimum number of UMIs for cell filtering.
core_n_lasso:
number of cores for lasso regression in the linkage analysis. By default it is 1 and parallelism is turned off.
Parallel computing can significantly speed up the computation, especially for datasets involving many cells or
genes, but for smaller datasets the additional overhead may actually slow things down. User discretion is advised.
core_n_filtering:
number of cores for filtering TF-gene links. Not used currently.
motif_ref:
The path to the TF binding motif data as described above. It provides the list of TF gene names and
is used to process the adata object to generate the TF expression and target new-expression matrices for the
glmnet-based TF-target synthesis rate linkage analysis, but it is currently not used for motif-based filtering.
By default it is a Dropbox link that stores the data from us. Other motif references can be downloaded from RcisTarget:
https://resources.aertslab.org/cistarget/. The human motif matrix can be downloaded from June's shared folder:
https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather
TF_link_ENCODE_ref:
The path to the TF ChIP-seq data. By default it is a Dropbox link from us that stores the data. Other data can
be downloaded from: https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets.
nt_layers:
The layers that will be used for the network inference. Note that the layers can be changed flexibly. See
the description of this function above.
Note that if your internet connection is slow, we recommend downloading the `motif_ref` and `TF_link_ENCODE_ref` files
and supplying those two arguments with the local paths where the downloaded datasets are saved.
Returns
-------
An updated adata object with a new key `scifate` in the .uns attribute, which stores the raw lasso regression results
and the filtered results after applying Fisher's exact test to the ChIP-seq peaks.
"""
try:
import glmnet_python
global cvglmnet, cvglmnetCoef
from glmnet_python import cvglmnet, cvglmnetCoef
except ImportError:
raise ImportError(
"You need to install the package `glmnet_python`."
"The original version https://github.com/bbalasub1/glmnet_python is not updated."
"Plelease install johnlees's fork https://github.com/johnlees/glmnet_python."
"Also check this pull request for more details: "
"https://github.com/bbalasub1/glmnet_python/pull/47"
)
df_gene_TF_link_ENCODE = pd.read_csv(TF_link_ENCODE_ref, sep="\t")
motifAnnotations_hgnc = pd.read_csv(motif_ref, sep="\t")
TF_list = motifAnnotations_hgnc.loc[:, "TF"]
_, new_mat, obs, var, var_TF, TF_matrix = adata_processing_TF_link(
adata, nt_layers, TF_list, gene_filter_rate, cell_filter_UMI
)
# Apply LASSO for TF-gene linkage analysis
new_size_factor = obs.loc[:, "Size_Factor"]
new_size_factor = (new_size_factor - np.mean(new_size_factor)) / np.std(new_size_factor)
TF_matrix.loc["new_size_factor"] = new_size_factor
labeling_rate = obs["labeling_rate"]
if np.std(labeling_rate) != 0:
labeling_ratio = (labeling_rate - np.mean(labeling_rate)) / np.std(labeling_rate)
TF_matrix.loc["labeling_ratio", :] = labeling_ratio
# link TF and genes based on covariance
link_result = link_TF_gene_analysis(TF_matrix, new_mat, var_TF, core_num=core_n_lasso)
link_result = pd.concat([i for i in link_result if not isinstance(i, str)], axis=0)
# filtering the links using TF-gene binding data and store the result in the target folder
# note that currently the motif filtering is not implemented
df_gene_TF_link_chip = TF_gene_filter_links(link_result, var, core_n_filtering, motif_ref, df_gene_TF_link_ENCODE)
adata.uns["scifate"] = {
"glmnet_res": link_result,
"glmnet_chip_filter": df_gene_TF_link_chip,
}
return adata
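# A minimal usage sketch (not part of the original sci-fate pipeline). It assumes an AnnData object that
# already carries raw counts in the "new"/"total" layers, normalized expression in the "X_new"/"X_total"
# layers, and "gene_id"/"gene_short_name" columns in .var; the input file name below is hypothetical.
def _example_scifate_glmnet_usage():
    import anndata

    adata = anndata.read_h5ad("labeling_data.h5ad")  # hypothetical metabolic labeling dataset
    adata = scifate_glmnet(
        adata,
        gene_filter_rate=0.1,
        cell_filter_UMI=10000,
        core_n_lasso=1,
        nt_layers=["X_new", "X_total"],
    )
    # raw lasso coefficients and the ChIP-seq filtered TF-target links
    raw_links = adata.uns["scifate"]["glmnet_res"]
    filtered_links = adata.uns["scifate"]["glmnet_chip_filter"]
    return raw_links, filtered_links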
def adata_processing_TF_link(
adata: AnnData,
nt_layers: list,
TF_list: list,
gene_filter_rate: float = 0.1,
cell_filter_UMI: int = 10000,
) -> tuple:
"""preprocess adata and get ready for TF-target gene analysis"""
n_obs, n_var = adata.n_obs, adata.n_vars
# filter genes
print(f"Original gene number: {n_var}")
gene_filter_new = (adata.layers[nt_layers[0]] > 0).sum(0) > (gene_filter_rate * n_obs)
gene_filter_tot = (adata.layers[nt_layers[1]] > 0).sum(0) > (gene_filter_rate * n_obs)
if issparse(adata.layers[nt_layers[0]]):
gene_filter_new = gene_filter_new.A1
if issparse(adata.layers[nt_layers[1]]):
gene_filter_tot = gene_filter_tot.A1
adata = adata[:, gene_filter_new * gene_filter_tot]
print(f"Gene number after filtering: {sum(gene_filter_new * gene_filter_tot)}")
# filter cells
print(f"Original cell number: {n_obs}")
cell_filter = adata.layers[nt_layers[1]].sum(1) > cell_filter_UMI
if issparse(adata.layers[nt_layers[1]]):
cell_filter = cell_filter.A1
adata = adata[cell_filter, :]
print(f"Cell number after filtering: {adata.n_obs}")
# generate the expression matrix for downstream analysis
new = adata.layers[nt_layers[0]]
total = adata.layers[nt_layers[1]]
# recalculate size factor
from ..preprocessing.deprecated import calc_sz_factor_legacy
adata = calc_sz_factor_legacy(
adata,
method="mean-geometric-mean-total",
round_exprs=True,
total_layers=["total"],
)
szfactors = adata.obs["Size_Factor"].values[:, None]
# normalize data (size factor correction, log transform and the scaling)
if issparse(new):
new = new.A
if issparse(total):
total = total.A
new_mat = normalize_data(new, szfactors, pseudo_expr=0.1)
tot_mat = normalize_data(total, szfactors, pseudo_expr=0.1)
new_mat = pd.DataFrame(new_mat, index=adata.obs_names, columns=adata.var_names)
tot_mat = pd.DataFrame(tot_mat, index=adata.obs_names, columns=adata.var_names)
# compute the labeling reads rate in each cell
obs = adata.obs
var = adata.var
var.loc[:, "gene_short_name"] = make_index_unique(var.loc[:, "gene_short_name"].astype("str"))
new_sum, tot_sum = adata.layers["new"].sum(1), adata.layers["total"].sum(1)
obs.loc[:, "labeling_rate"] = np.asarray(new_sum).flatten() / np.asarray(tot_sum).flatten()
# extract the TF matrix
var_TF = var.query("gene_short_name in @TF_list")
print(f"\nNumber of TFs found in the list: {var_TF.shape[0]}")
TF_matrix = tot_mat.loc[:, var_TF.loc[:, "gene_id"]]
TF_matrix.columns = var_TF.loc[:, "gene_short_name"]
return (tot_mat.T, new_mat.T, obs, var, var_TF, TF_matrix.T)
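# A sketch of the AnnData structure that adata_processing_TF_link expects, built from synthetic toy data
# (purely illustrative, and an assumption about the minimal required fields): normalized expression in the
# layers named by `nt_layers` ("X_new"/"X_total" by default), raw counts in the "new"/"total" layers used
# for size factors and the labeling rate, and "gene_id"/"gene_short_name" columns in .var whose gene ids
# match the var index. For such a small toy object, `cell_filter_UMI` would need to be far below the
# default of 10000.
def _example_toy_adata(n_cells: int = 50, n_genes: int = 20) -> AnnData:
    rng = np.random.default_rng(0)
    total = rng.poisson(5, size=(n_cells, n_genes)).astype(float)
    new = np.minimum(rng.poisson(2, size=(n_cells, n_genes)).astype(float), total)
    gene_ids = [f"ENSG{i:011d}" for i in range(n_genes)]
    var = pd.DataFrame(
        {"gene_id": gene_ids, "gene_short_name": [f"gene_{i}" for i in range(n_genes)]},
        index=gene_ids,
    )
    obs = pd.DataFrame(index=[f"cell_{i}" for i in range(n_cells)])
    adata = AnnData(X=total, obs=obs, var=var)
    adata.layers["total"], adata.layers["new"] = total, new
    adata.layers["X_total"], adata.layers["X_new"] = total.copy(), new.copy()
    return adata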
def link_TF_gene_analysis(
TF_matrix: pd.DataFrame,
gene_matrix: pd.DataFrame,
var_TF: pd.DataFrame,
core_num: int = 1,
cor_thresh: float = 0.03,
seed: int = 123456,
) -> list:
"""Perform lasso regression for each gene."""
gene_list = gene_matrix.index
link_result = [None] * len(gene_list)
for i, linked_gene in tqdm(enumerate(gene_list), total=len(gene_list), desc="Performing lasso regression for each gene"):
gene_name = var_TF.loc[linked_gene, "gene_short_name"] if linked_gene in var_TF.index else None
TFs = TF_matrix.index
# exclude the target gene itself from its candidate regulators
valid_gene_ids = TFs if gene_name is None else list(TFs.difference([gene_name]))
input_TF_matrix = TF_matrix.loc[valid_gene_ids, :]
result = TF_gene_link(
input_TF_matrix,
linked_gene,
gene_matrix.loc[linked_gene, :],
core_num=core_num,
cor_thresh=cor_thresh,
seed=seed,
)
link_result[i] = result
return link_result
def TF_gene_link(
TF_matrix: pd.DataFrame,
linked_gene: str,
linked_gene_expr_vector: pd.Series,
core_num: int = 1,
cor_thresh: float = 0.03,
seed: int = 123456,
) -> Union[pd.DataFrame, str]:
"""Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene."""
expr_cor = einsum_correlation(TF_matrix.values, linked_gene_expr_vector.values)[0]
select_sites = abs(expr_cor) > cor_thresh
TF_matrix = TF_matrix.iloc[select_sites, :]
if select_sites.sum() > 1:
TF_matrix = TF_matrix.T
result = lasso_regression_expression(TF_matrix, linked_gene, linked_gene_expr_vector, seed, core_num)
return result
else:
return "unknown"
def lasso_regression_expression(x1: pd.DataFrame, linked_gene: str, y: pd.Series, seed: int, parallel: int = 1) -> pd.DataFrame:
"""Lasso linear model with iterative fitting along a regularization path; the best model is selected by cross-validation."""
x1 = x1.loc[:, ~x1.columns.duplicated(keep="first")]
# the following code should match that from cv.glmnet in R if the foldid is hard coded to be the same.
# https://github.com/bbalasub1/glmnet_python/issues/45
np.random.seed(seed)
seq = np.linspace(np.log(0.001), np.log(10), 100)
# alpha = 1 is required for lasso regression.
cv_out = cvglmnet(
x=x1.values.astype("float64"),
y=y.values.astype("float64"),
ptype="mse",
alpha=1,
lambdau=np.exp(seq),
parallel=parallel,
)
r2 = r2_glmnet(cv_out, y)
bestlam = cv_out["lambda_1se"]
# this rescaling ensures results very similar to those of R's cv.glmnet
cor_list = cvglmnetCoef(cv_out, s=bestlam).flatten() / np.sqrt(x1.shape[0])
df_cor = pd.DataFrame(
{
"id": x1.columns,
"corcoef": cor_list[1:], # ignore the first intercept term
"r_squre": r2,
"linked_gene": linked_gene,
}
)
return df_cor
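# If glmnet_python is unavailable, a rough scikit-learn stand-in for the fit above is sketched below.
# This is NOT the method used by sci-fate/dynamo: LassoCV picks the alpha minimizing cross-validated MSE
# rather than glmnet's lambda_1se rule, its alpha parameterization is scaled differently from glmnet's
# lambda, and the reported R^2 is computed on the training data, so the coefficients will not match.
def _lasso_regression_expression_sklearn(x1: pd.DataFrame, linked_gene: str, y: pd.Series, seed: int = 123456) -> pd.DataFrame:
    from sklearn.linear_model import LassoCV

    x1 = x1.loc[:, ~x1.columns.duplicated(keep="first")]
    alphas = np.exp(np.linspace(np.log(0.001), np.log(10), 100))
    model = LassoCV(alphas=alphas, cv=10, random_state=seed).fit(x1.values.astype("float64"), y.values.astype("float64"))
    return pd.DataFrame(
        {
            "id": x1.columns,
            "corcoef": model.coef_,
            "r_squre": max(model.score(x1.values, y.values), 0.0),  # training R^2, floored at 0 like r2_glmnet
            "linked_gene": linked_gene,
        }
    )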
def r2_glmnet(cv_out, y):
"""calculate r2 using the lambda_1se. This value is for the most regularized model whose mean squared error is
within one standard error of the minimal."""
# https://stackoverflow.com/questions/50610895/how-to-calculate-r-squared-value-for-lasso-regression-using-glmnet-in-r
bestlam = cv_out["lambda_1se"]
i = np.where(cv_out["lambdau"] == bestlam)[0]
e = cv_out["cvm"][i]
r2 = 1 - e / np.var(y)
if r2 < 0:
r2 = [0]
return r2[0]
def TF_gene_filter_links(raw_glmnet_res: pd.DataFrame, var: pd.DataFrame, core_n: int, motif_ref: str, df_gene_TF_link_ENCODE: pd.DataFrame) -> pd.DataFrame:
"""Prepare data for TF-target gene link filtering and filter the links using the ENCODE ChIP-seq peaks."""
var_ori = var.copy()
if "gene_type" not in var.columns:
var["gene_type"] = None
df_gene = var.loc[:, ["gene_id", "gene_short_name", "gene_type"]]
df_gene.columns = ["linked_gene", "linked_gene_name", "gene_type"]
raw_glmnet_res_var = raw_glmnet_res.merge(df_gene)
raw_glmnet_res_var = raw_glmnet_res_var.query("id != 'labeling_ratio' and id != 'new_size_factor'")
# filter the link by ChIP-seq analysis
df_gene_TF_link_chip = TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var_ori, cor_thresh=0.04)
return df_gene_TF_link_chip