import numpy as np
import pandas as pd
import warnings
from scipy.sparse import issparse, csr_matrix
from sklearn.decomposition import FastICA
from sklearn.utils import sparsefuncs
from ..tools.utils import update_dict
from .utils import (
convert2gene_symbol,
pca,
clusters_stats,
cook_dist,
get_layer_keys,
get_shared_counts,
get_svr_filter,
Freeman_Tukey,
merge_adata_attrs,
sz_util,
normalize_util,
get_sz_exprs,
unique_var_obs_adata,
layers2csr,
collapse_adata,
NTR,
detect_datatype,
basic_stats,
add_noise_to_duplicates,
)
from .cell_cycle import cell_cycle_scores
def szFactor(
adata_ori,
layers="all",
total_layers=None,
locfunc=np.nanmean,
round_exprs=False,
method="median",
use_all_genes_cells=True,
):
"""Calculate the size factor of the each cell using geometric mean of total UMI across cells for a AnnData object.
This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata_ori: :class:`~anndata.AnnData`.
AnnData object.
layers: str or list (default: `all`)
The layer(s) to be normalized. Default is `all`, including RNA (X, raw) or spliced, unspliced, protein, etc.
total_layers: list or None (default `None`)
The layer(s) that can be summed up to get the total mRNA. for example, ["spliced", "unspliced"], ["uu", "ul", "su", "sl"] or ["new", "old"], etc.
locfunc: `function` (default: `np.nanmean`)
The function to normalize the data.
round_exprs: `bool` (default: `False`)
A logic flag to determine whether the gene expression should be rounded into integers.
method: `str` (default: `mean-geometric-mean-total`)
The method used to calculate the expected total reads / UMI used in size factor calculation.
Only `mean-geometric-mean-total` / `geometric` and `median` are supported. When `median` is used, `locfunc` will be replaced with
`np.nanmedian`.
use_all_genes_cells: `bool` (default: `True`)
A logic flag to determine whether all cells and genes should be used for the size factor calculation.
Returns
-------
adata: :AnnData
A updated anndata object that are updated with the `Size_Factor` (`layer_` + `Size_Factor`) column(s) in the obs attribute.
"""
if use_all_genes_cells:
adata = adata_ori
else:
cell_inds = adata_ori.obs.use_for_pca if 'use_for_pca' in adata_ori.obs.columns else adata_ori.obs.index
filter_list = ['use_for_pca', 'pass_basic_filter']
filter_checker = [i in adata_ori.var.columns for i in filter_list]
which_filter = np.where(filter_checker)[0]
gene_inds = adata_ori.var[filter_list[which_filter[0]]] if len(which_filter) > 0 else adata_ori.var.index
adata = adata_ori[cell_inds, :][:, gene_inds]
if total_layers is not None:
if not isinstance(total_layers, list): total_layers = [total_layers]
if len(set(total_layers).difference(adata.layers.keys())) == 0:
total = None
for t_key in total_layers:
total = (
adata.layers[t_key] if total is None else total + adata.layers[t_key]
)
adata.layers["_total_"] = total
layers = get_layer_keys(adata, layers)
if "raw" in layers and adata.raw is None:
adata.raw = adata.copy()
for layer in layers:
sfs, cell_total = sz_util(adata, layer, round_exprs, method, locfunc, total_layers=total_layers)
sfs[~np.isfinite(sfs)] = 1
if layer == "raw":
adata.obs[layer + "_Size_Factor"] = sfs
adata.obs["Size_Factor"] = sfs
adata.obs["initial_cell_size"] = cell_total
elif layer == "X":
adata.obs["Size_Factor"] = sfs
adata.obs["initial_cell_size"] = cell_total
elif layer == "_total_":
adata.obs["total_Size_Factor"] = sfs
adata.obs["initial" + layer + "cell_size"] = cell_total
del adata.layers["_total_"]
else:
adata.obs[layer + "_Size_Factor"] = sfs
adata.obs["initial_" + layer + "_cell_size"] = cell_total
adata_ori = merge_adata_attrs(adata_ori, adata, attr='obs')
return adata_ori
def normalize_expr_data(
adata,
layers="all",
total_szfactor='total_Size_Factor',
norm_method=None,
pseudo_expr=1,
relative_expr=True,
keep_filtered=True,
recalc_sz=False,
sz_method='median',
):
"""Normalize the gene expression value for the AnnData object
This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
layers: `str` (default: `all`)
The layer(s) to be normalized. Default is all, including RNA (X, raw) or spliced, unspliced, protein, etc.
total_szfactor: `str` (default: `total_Size_Factor`)
The column name in the .obs attribute that corresponds to the size factor for the total mRNA.
norm_method: `function` or None (default: `None`)
The method used to normalize data. Can be either function `np.log1p, np.log2 or any other functions or string
`clr`. By default, only .X will be size normalized and log1p transformed while data in other layers will only
be size normalized.
pseudo_expr: `int` (default: `1`)
A pseudocount added to the gene expression value before log/log2 normalization.
relative_expr: `bool` (default: `True`)
A logic flag to determine whether we need to divide gene expression values first by size factor before normalization.
keep_filtered: `bool` (default: `True`)
A logic flag to determine whether we will only store feature genes in the adata object. If it is False, size factor
will be recalculated only for the selected feature genes.
recalc_sz: `bool` (default: `False`)
A logic flag to determine whether we need to recalculate size factor based on selected genes before normalization.
sz_method: `str` (default: `mean-geometric-mean-total`)
The method used to calculate the expected total reads / UMI used in size factor calculation.
Only `mean-geometric-mean-total` / `geometric` and `median` are supported. When `median` is used, `locfunc` will be replaced with
`np.nanmedian`.
Returns
-------
adata: :AnnData
A updated anndata object that are updated with normalized expression values for different layers.
"""
if recalc_sz:
if "use_for_pca" in adata.var.columns and keep_filtered is False:
adata = adata[:, adata.var.loc[:, "use_for_pca"]]
adata.obs = adata.obs.loc[:, ~adata.obs.columns.str.contains("Size_Factor")]
layers = get_layer_keys(adata, layers)
layer_sz_column_names = [i + "_Size_Factor" for i in set(layers).difference("X")]
layer_sz_column_names.extend(["Size_Factor"])
layers_to_sz = list(set(layer_sz_column_names).difference(adata.obs.keys()))
if len(layers_to_sz) > 0:
layers = (
pd.Series(layers_to_sz)
.str.split("_Size_Factor", expand=True)
.iloc[:, 0]
.tolist()
)
if "Size_Factor" in layers:
layers[np.where(np.array(layers) == "Size_Factor")[0][0]] = "X"
szFactor(
adata,
layers=layers,
locfunc=np.nanmean,
round_exprs=True,
method=sz_method,
)
for layer in layers:
szfactors, CM = get_sz_exprs(adata, layer, total_szfactor=total_szfactor)
if norm_method is None and layer == 'X':
CM = normalize_util(CM, szfactors, relative_expr, pseudo_expr, np.log1p)
elif norm_method in [np.log1p, np.log, np.log2, Freeman_Tukey, None] and layer != "protein":
CM = normalize_util(CM, szfactors, relative_expr, pseudo_expr, norm_method)
elif layer == "protein": # norm_method == 'clr':
if norm_method != "clr":
warnings.warn(
"For protein data, log transformation is not recommended. Using clr normalization by default."
)
"""This normalization implements the centered log-ratio (CLR) normalization from Seurat which is computed for
each gene (M Stoeckius, 2017).
"""
CM = CM.T
n_feature = CM.shape[1]
for i in range(CM.shape[0]):
x = CM[i].A if issparse(CM) else CM[i]
res = np.log1p(x / (np.exp(np.nansum(np.log1p(x[x > 0])) / n_feature)))
res[np.isnan(res)] = 0
# res[res > 100] = 100
CM[
i
] = res # no .A is required # https://stackoverflow.com/questions/28427236/set-row-of-csr-matrix
CM = CM.T
else:
warnings.warn(norm_method + " is not implemented yet")
if layer in ["raw", "X"]:
adata.X = CM
elif layer == "protein" and "protein" in adata.obsm_keys():
adata.obsm["X_protein"] = CM
else:
adata.layers["X_" + layer] = CM
adata.uns["pp_norm_method"] = norm_method.__name__ if callable(norm_method) else norm_method
return adata
def Gini(adata, layers="all"):
"""Calculate the Gini coefficient of a numpy array.
https://github.com/thomasmaxwellnorman/perturbseq_demo/blob/master/perturbseq/util.py
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object
layers: str (default: None)
The layer(s) to be normalized. Default is all, including RNA (X, raw) or spliced, unspliced, protein, etc.
Returns
-------
adata: :AnnData
A updated anndata object with gini score for the layers (include .X) in the corresponding var columns (layer + '_gini').
"""
# From: https://github.com/oliviaguest/gini
# based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
# from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
layers = get_layer_keys(adata, layers)
for layer in layers:
if layer == "raw":
CM = adata.raw.X
elif layer == "X":
CM = adata.X
elif layer == "protein":
if "protein" in adata.obsm_keys():
CM = adata.obsm[layer]
else:
continue
else:
CM = adata.layers[layer]
n_features = adata.shape[1]
gini = np.zeros(n_features)
for i in np.arange(n_features):
cur_cm = (
CM[:, i].A if issparse(CM) else CM[:, i]
) # all values are treated equally, arrays must be 1d
if np.amin(CM) < 0:
cur_cm -= np.amin(cur_cm) # values cannot be negative
cur_cm += 0.0000001 # np.min(array[array!=0]) #values cannot be 0
cur_cm = np.sort(cur_cm) # values must be sorted
index = np.arange(1, cur_cm.shape[0] + 1) # index per array element
n = cur_cm.shape[0] # number of array elements
gini[i] = (np.sum((2 * index - n - 1) * cur_cm)) / (
n * np.sum(cur_cm)
) # Gini coefficient
if layer in ["raw", "X"]:
adata.var["gini"] = gini
else:
adata.var[layer + "_gini"] = gini
return adata
def parametricDispersionFit(disp_table, initial_coefs=np.array([1e-6, 1])):
"""fThis function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
disp_table: :class:`~pandas.DataFrame`
AnnData object
initial_coefs: :class:`~numpy.ndarray`
Initial parameters for the gamma fit of the dispersion parameters.
Returns
-------
fit: :class:`~statsmodels.api.formula.glm`
A statsmodels fitting object.
coefs: :class:`~numpy.ndarray`
The two resulting gamma fitting coefficients.
good: :class:`~pandas.DataFrame`
The subsetted dispersion table that is subjected to Gamma fitting.
"""
import statsmodels.api as sm
coefs = initial_coefs
iter = 0
while True:
residuals = disp_table["disp"] / (coefs[0] + coefs[1] / disp_table["mu"])
good = disp_table.loc[(residuals > initial_coefs[0]) & (residuals < 10000), :]
# https://stats.stackexchange.com/questions/356053/the-identity-link-function-does-not-respect-the-domain-of-the-gamma-family
fit = sm.formula.glm(
"disp ~ I(1 / mu)",
data=good,
family=sm.families.Gamma(link=sm.genmod.families.links.identity),
).fit(start_params=coefs)
oldcoefs = coefs
coefs = fit.params
if coefs[0] < initial_coefs[0]:
coefs[0] = initial_coefs[0]
if coefs[1] < 0:
warnings.warn("Parametric dispersion fit may be failed.")
if np.sum(np.log(coefs / oldcoefs) ** 2 < coefs[0]):
break
iter += 1
if iter > 10:
warnings.warn("Dispersion fit didn't converge")
break
if not np.all(coefs > 0):
warnings.warn("Parametric dispersion fit may be failed.")
return fit, coefs, good
def disp_calc_helper_NB(adata, layers="X", min_cells_detected=1):
""" This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object
min_cells_detected: `int` (default: None)
The mimimal required number of cells with expression for selecting gene for dispersion fitting.
layer: `str`
The layer of data used for dispersion fitting.
Returns
-------
res: :class:`~pandas.DataFrame`
A pandas dataframe with mu, dispersion for each gene that passes filters.
"""
layers = get_layer_keys(adata, layers=layers, include_protein=False)
res_list = []
for layer in layers:
if layer == "raw":
CM = adata.raw.X
szfactors = adata.obs[layer + "Size_Factor"][:, None]
elif layer == "X":
CM = adata.X
szfactors = adata.obs["Size_Factor"][:, None]
else:
CM = adata.layers[layer]
szfactors = adata.obs[layer + "Size_Factor"][:, None]
if issparse(CM):
CM.data = np.round(CM.data, 0)
rounded = CM
else:
rounded = CM.round().astype("int")
lowerDetectedLimit = (
adata.uns["lowerDetectedLimit"]
if "lowerDetectedLimit" in adata.uns.keys()
else 1
)
nzGenes = (rounded > lowerDetectedLimit).sum(axis=0)
nzGenes = nzGenes > min_cells_detected
nzGenes = nzGenes.A1 if issparse(rounded) else nzGenes
if layer.startswith("X_"):
x = rounded[:, nzGenes]
else:
x = (
rounded[:, nzGenes].multiply(csr_matrix(1 / szfactors))
if issparse(rounded)
else rounded[:, nzGenes] / szfactors
)
xim = np.mean(1 / szfactors) if szfactors is not None else 1
f_expression_mean = x.mean(axis=0)
# For NB: Var(Y) = mu * (1 + mu / k)
# x.A.var(axis=0, ddof=1)
f_expression_var = (
(x.multiply(x).mean(0).A1 - f_expression_mean.A1 ** 2)
* x.shape[0]
/ (x.shape[0] - 1)
if issparse(x)
else x.var(axis=0, ddof=0) ** 2
) # np.mean(np.power(x - f_expression_mean, 2), axis=0) # variance with n - 1
# https://scialert.net/fulltext/?doi=ajms.2010.1.15 method of moments
disp_guess_meth_moments = (
f_expression_var - xim * f_expression_mean
) # variance - mu
disp_guess_meth_moments = disp_guess_meth_moments / np.power(
f_expression_mean, 2
) # this is dispersion parameter (1/k)
res = pd.DataFrame(
{
"mu": np.array(f_expression_mean).flatten(),
"disp": np.array(disp_guess_meth_moments).flatten(),
}
)
res.loc[res["mu"] == 0, "mu"] = None
res.loc[res["mu"] == 0, "disp"] = None
res.loc[res["disp"] < 0, "disp"] = 0
res["gene_id"] = adata.var_names[nzGenes]
res_list.append(res)
return layers, res_list
def topTable(adata, layer="X", mode="dispersion"):
""" This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object
Returns
-------
disp_df: :class:`~pandas.DataFrame`
The data frame with the gene_id, mean_expression, dispersion_fit and dispersion_empirical as the columns.
"""
layer = get_layer_keys(adata, layers=layer, include_protein=False)[0]
if layer in ["X"]:
key = "dispFitInfo"
else:
key = layer + "_dispFitInfo"
if mode == "dispersion":
if adata.uns[key] is None:
raise KeyError(
"Error: no dispersion model found. Please call estimateDispersions() before calling this function"
)
top_df = pd.DataFrame(
{
"gene_id": adata.uns[key]["disp_table"]["gene_id"],
"mean_expression": adata.uns[key]["disp_table"]["mu"],
"dispersion_fit": adata.uns[key]["disp_func"](
adata.uns[key]["disp_table"]["mu"]
),
"dispersion_empirical": adata.uns[key]["disp_table"]["disp"],
}
)
top_df = top_df.set_index("gene_id")
elif mode == "gini":
top_df = adata.var[layer + "_gini"]
return top_df
def vstExprs(adata, expr_matrix=None, round_vals=True):
""" This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object
dispModelName: `str`
The name of the dispersion function to use for VST.
expr_matrix: :class:`~numpy.ndarray`
An matrix of values to transform. Must be normalized (e.g. by size factors) already. This function doesn't do this for you.
round_vals: `bool`
Whether to round expression values to the nearest integer before applying the transformation.
Returns
-------
res: :class:`~numpy.ndarray`
A numpy array of the gene expression after VST.
"""
fitInfo = adata.uns["dispFitInfo"]
coefs = fitInfo["coefs"]
if expr_matrix is None:
ncounts = adata.X
if round_vals:
if issparse(ncounts):
ncounts.data = np.round(ncounts.data, 0)
else:
ncounts = ncounts.round().astype("int")
else:
ncounts = expr_matrix
def vst(q): # c( "asymptDisp", "extraPois" )
return np.log(
(
1
+ coefs[1]
+ 2 * coefs[0] * q
+ 2 * np.sqrt(coefs[0] * q * (1 + coefs[1] + coefs[0] * q))
)
/ (4 * coefs[0])
) / np.log(2)
res = vst(ncounts.toarray()) if issparse(ncounts) else vst(ncounts)
return res
def Dispersion(
adata, layers="X", modelFormulaStr="~ 1", min_cells_detected=1, removeOutliers=False
):
""" This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object
layers: `str` (default: 'X')
The layer(s) to be used for calculating dispersion. Default is X if there is no spliced layers.
modelFormulaStr: `str`
The model formula used to calculate dispersion parameters. Not used.
min_cells_detected: `int`
The minimum number of cells detected for calculating the dispersion.
removeOutliers: `bool` (default: True)
Whether to remove outliers when performing dispersion fitting.
Returns
-------
adata: :class:`~anndata.AnnData`
A updated annData object with dispFitInfo added to uns attribute as a new key.
"""
import re
mu = None
model_terms = [x.strip() for x in re.compile("~|\\*|\\+").split(modelFormulaStr)]
model_terms = list(set(model_terms) - set([""]))
cds_pdata = adata.obs # .loc[:, model_terms]
cds_pdata["rowname"] = cds_pdata.index.values
layers, disp_tables = disp_calc_helper_NB(adata[:, :], layers, min_cells_detected)
# disp_table['disp'] = np.random.uniform(0, 10, 11)
# disp_table = cds_pdata.apply(disp_calc_helper_NB(adata[:, :], min_cells_detected))
# cds_pdata <- dplyr::group_by_(dplyr::select_(rownames_to_column(pData(cds)), "rowname", .dots=model_terms), .dots=model_terms)
# disp_table <- as.data.frame(cds_pdata %>% do(disp_calc_helper_NB(cds[,.$rowname], cds@expressionFamily, min_cells_detected)))
for ind in range(len(layers)):
layer, disp_table = layers[ind], disp_tables[ind]
if disp_table is None:
raise Exception(
"Parametric dispersion fitting failed, please set a different lowerDetectionLimit"
)
disp_table = disp_table.loc[np.where(disp_table["mu"] != np.nan)[0], :]
res = parametricDispersionFit(disp_table)
fit, coefs, good = res[0], res[1], res[2]
if removeOutliers:
# influence = fit.get_influence().cooks_distance()
# #CD is the distance and p is p-value
# (CD, p) = influence.cooks_distance
CD = cook_dist(fit, 1 / good["mu"][:, None], good)
cooksCutoff = 4 / good.shape[0]
print("Removing ", len(CD[CD > cooksCutoff]), " outliers")
outliers = CD > cooksCutoff
# use CD.index.values? remove genes that lost when doing parameter fitting
lost_gene = set(good.index.values).difference(set(range(len(CD))))
outliers[lost_gene] = True
res = parametricDispersionFit(good.loc[~outliers, :])
fit, coefs = res[0], res[1]
def ans(q):
return coefs[0] + coefs[1] / q
if layer == "X":
adata.uns["dispFitInfo"] = {
"disp_table": good,
"disp_func": ans,
"coefs": coefs,
}
else:
adata.uns[layer + "_dispFitInfo"] = {
"disp_table": good,
"disp_func": ans,
"coefs": coefs,
}
return adata
def SVRs(
adata_ori,
filter_bool=None,
layers="X",
relative_expr=True,
total_szfactor='total_Size_Factor',
min_expr_cells=0,
min_expr_avg=0,
max_expr_avg=0,
svr_gamma=None,
winsorize=False,
winsor_perc=(1, 99.5),
sort_inverse=False,
use_all_genes_cells=False,
):
"""This function is modified from https://github.com/velocyto-team/velocyto.py/blob/master/velocyto/analysis.py
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
filter_bool: :class:`~numpy.ndarray` (default: None)
A boolean array from the user to select genes for downstream analysis.
layers: `str` (default: 'X')
The layer(s) to be used for calculating dispersion score via support vector regression (SVR). Default is X if there is no spliced layers.
relative_expr: `bool` (default: `True`)
A logic flag to determine whether we need to divide gene expression values first by size factor before run SVR.
total_szfactor: `str` (default: `total_Size_Factor`)
The column name in the .obs attribute that corresponds to the size factor for the total mRNA.
min_expr_cells: `int` (default: `2`)
minimum number of cells that express that gene for it to be considered in the fit.
min_expr_avg: `int` (default: `0`)
The minimum average of genes across cells accepted.
max_expr_avg: `float` (defaul: `20`)
The maximum average of genes across cells accepted before treating house-keeping/outliers for removal.
svr_gamma: `float` or None (default: `None`)
the gamma hyper-parameter of the SVR.
winsorize: `bool` (default: `False`)
Wether to winsorize the data for the cv vs mean model.
winsor_perc: `tuple` (default: `(1, 99.5)`)
the up and lower bound of the winsorization.
sort_inverse: `bool` (default: `False`)
if True it sorts genes from less noisy to more noisy (to use for size estimation not for feature selection).
use_all_genes_cells: `bool` (default: `False`)
A logic flag to determine whether all cells and genes should be used for the size factor calculation.
Returns
-------
adata: :class:`~anndata.AnnData`
A updated annData object with `log_m`, `log_cv`, `score` added to .obs columns and `SVR` added to uns attribute
as a new key.
"""
from sklearn.svm import SVR
layers = get_layer_keys(adata_ori, layers)
if use_all_genes_cells:
adata = adata_ori[:, filter_bool].copy() if filter_bool is not None else adata_ori
else:
cell_inds = adata_ori.obs.use_for_pca if 'use_for_pca' in adata_ori.obs.columns else adata_ori.obs.index
filter_list = ['use_for_pca', 'pass_basic_filter']
filter_checker = [i in adata_ori.var.columns for i in filter_list]
which_filter = np.where(filter_checker)[0]
gene_inds = adata_ori.var[filter_list[which_filter[0]]] if len(which_filter) > 0 else adata_ori.var.index
adata = adata_ori[cell_inds, gene_inds].copy()
filter_bool = filter_bool[gene_inds]
for layer in layers:
if layer == "raw":
CM = adata.X.copy() if adata.raw is None else adata.raw
szfactors = (
adata.obs[layer + "_Size_Factor"].values[:, None]
if adata.raw.X is not None
else adata.obs["Size_Factor"].values[:, None]
)
elif layer == "X":
CM = adata.X.copy()
szfactors = adata.obs["Size_Factor"].values[:, None]
elif layer == "protein":
if "protein" in adata.obsm_keys():
CM = adata.obsm["protein"].copy()
szfactors = adata.obs[layer + "_Size_Factor"].values[:, None]
else:
continue
else:
CM = adata.layers[layer].copy()
szfactors = (
adata.obs[layer + "_Size_Factor"].values[:, None]
if layer + "_Size_Factor" in adata.obs.columns
else None
)
if total_szfactor is not None and total_szfactor in adata.obs.keys():
szfactors = (
adata.obs[total_szfactor].values[:, None]
if total_szfactor in adata.obs.columns
else None
)
if szfactors is not None and relative_expr:
if issparse(CM):
sparsefuncs.inplace_row_scale(CM, 1 / szfactors)
else:
CM /= szfactors
if winsorize:
if min_expr_cells <= ((100 - winsor_perc[1]) * CM.shape[0] * 0.01):
min_expr_cells = (
int(np.ceil((100 - winsor_perc[1]) * CM.shape[1] * 0.01)) + 2
)
detected_bool = np.array(
((CM > 0).sum(0) >= min_expr_cells)
& (CM.mean(0) <= max_expr_avg)
& (CM.mean(0) >= min_expr_avg)
).flatten()
valid_CM = CM[:, detected_bool]
if winsorize:
down, up = (
np.percentile(valid_CM.A, winsor_perc, 0)
if issparse(valid_CM)
else np.percentile(valid_CM, winsor_perc, 0)
)
Sfw = (
np.clip(valid_CM.A, down[None, :], up[None, :])
if issparse(valid_CM)
else np.percentile(valid_CM, winsor_perc, 0)
)
mu = Sfw.mean(0)
sigma = Sfw.std(0, ddof=1)
else:
mu = np.array(valid_CM.mean(0)).flatten()
sigma = (
np.array(
np.sqrt(
(valid_CM.multiply(valid_CM).mean(0).A1 - (mu) ** 2)
# * (adata.n_obs)
# / (adata.n_obs - 1)
)
)
if issparse(valid_CM)
else valid_CM.std(0, ddof=1)
)
cv = sigma / mu
log_m = np.array(np.log2(mu)).flatten()
log_cv = np.array(np.log2(cv)).flatten()
log_m[mu == 0], log_cv[mu == 0] = 0, 0
if svr_gamma is None:
svr_gamma = 150.0 / len(mu)
# Fit the Support Vector Regression
clf = SVR(gamma=svr_gamma)
clf.fit(log_m[:, None], log_cv)
fitted_fun = clf.predict
ff = fitted_fun(log_m[:, None])
score = log_cv - ff
if sort_inverse:
score = -score
prefix = "" if layer == "X" else layer + "_"
(
adata.var[prefix + "log_m"],
adata.var[prefix + "log_cv"],
adata.var[prefix + "score"],
) = (np.nan, np.nan, -np.inf)
(
adata.var.loc[detected_bool, prefix + "log_m"],
adata.var.loc[detected_bool, prefix + "log_cv"],
adata.var.loc[detected_bool, prefix + "score"],
) = (
np.array(log_m).flatten(),
np.array(log_cv).flatten(),
np.array(score).flatten(),
)
key = (
"velocyto_SVR"
if layer == "raw" or layer == "X"
else layer + "_velocyto_SVR"
)
adata_ori.uns[key] = {"SVR": fitted_fun}
adata_ori = merge_adata_attrs(adata_ori, adata, attr='var')
return adata_ori
def filter_cells(
adata,
filter_bool=None,
layer="all",
keep_filtered=False,
min_expr_genes_s=50,
min_expr_genes_u=25,
min_expr_genes_p=1,
max_expr_genes_s=np.inf,
max_expr_genes_u=np.inf,
max_expr_genes_p=np.inf,
shared_count=None,
):
"""Select valid cells based on a collection of filters.
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
filter_bool: :class:`~numpy.ndarray` (default: `None`)
A boolean array from the user to select cells for downstream analysis.
layer: `str` (default: `all`)
The data from a particular layer (include X) used for feature selection.
keep_filtered: `bool` (default: `False`)
Whether to keep cells that don't pass the filtering in the adata object.
min_expr_genes_s: `int` (default: `50`)
Minimal number of genes with expression for a cell in the data from the spliced layer (also used for X).
min_expr_genes_u: `int` (default: `25`)
Minimal number of genes with expression for a cell in the data from the unspliced layer.
min_expr_genes_p: `int` (default: `1`)
Minimal number of genes with expression for a cell in the data from in the protein layer.
max_expr_genes_s: `float` (default: `np.inf`)
Maximal number of genes with expression for a cell in the data from the spliced layer (also used for X).
max_expr_genes_u: `float` (default: `np.inf`)
Maximal number of genes with expression for a cell in the data from the unspliced layer.
max_expr_genes_p: `float` (default: `np.inf`)
Maximal number of protein with expression for a cell in the data from the protein layer.
shared_count: `float` (default: `30`)
The minimal shared number of counts for each cell across genes between layers.
Returns
-------
adata: :class:`~anndata.AnnData`
An updated AnnData object with use_for_pca as a new column in obs to indicate the selection of cells for
downstream analysis. adata will be subsetted with only the cells pass filtering if keep_filtered is set to be
False.
"""
detected_bool = np.ones(adata.X.shape[0], dtype=bool)
detected_bool = (detected_bool) & (
((adata.X > 0).sum(1) >= min_expr_genes_s)
& ((adata.X > 0).sum(1) <= max_expr_genes_s)
).flatten()
if ("spliced" in adata.layers.keys()) & (layer == "spliced" or layer == "all"):
detected_bool = (
detected_bool
& (
((adata.layers["spliced"] > 0).sum(1) >= min_expr_genes_s)
& ((adata.layers["spliced"] > 0).sum(1) <= max_expr_genes_s)
).flatten()
)
if ("unspliced" in adata.layers.keys()) & (layer == "unspliced" or layer == "all"):
detected_bool = (
detected_bool
& (
((adata.layers["unspliced"] > 0).sum(1) >= min_expr_genes_u)
& ((adata.layers["unspliced"] > 0).sum(1) <= max_expr_genes_u)
).flatten()
)
if ("protein" in adata.obsm.keys()) & (layer == "protein" or layer == "all"):
detected_bool = (
detected_bool
& (
((adata.obsm["protein"] > 0).sum(1) >= min_expr_genes_p)
& ((adata.obsm["protein"] > 0).sum(1) <= max_expr_genes_p)
).flatten()
)
if shared_count is not None:
layers = get_layer_keys(adata, layer, False)
detected_bool = detected_bool & get_shared_counts(
adata, layers, shared_count, "cell"
)
filter_bool = (
filter_bool & detected_bool if filter_bool is not None else detected_bool
)
filter_bool = np.array(filter_bool).flatten()
if keep_filtered:
adata.obs["use_for_pca"] = filter_bool
else:
adata._inplace_subset_obs(filter_bool)
adata.obs["use_for_pca"] = True
return adata
def filter_genes_by_clusters_(
adata, cluster, min_avg_U=0.02, min_avg_S=0.08, size_limit=40
):
"""Prepare filtering genes on the basis of cluster-wise expression threshold
This function is taken from velocyto in order to reproduce velocyto's DentateGyrus notebook.
Arguments
---------
adata: :class:`~anndata.AnnData`
AnnData object.
cluster: `str`
A column in the adata.obs attribute which will be used for cluster specific expression filtering.
min_avg_U: float
Include genes that have unspliced average bigger than `min_avg_U` in at least one of the clusters
min_avg_S: float
Include genes that have spliced average bigger than `min_avg_U` in at least one of the clusters
Note: the two conditions are combined by and "&" logical operator.
Returns
-------
Nothing but it creates the attribute
clu_avg_selected: np.ndarray bool
The gene cluster that is selected
To perform the filtering use the method `filter_genes`
"""
U, S, cluster_uid = (
adata.layers["unspliced"],
adata.layers["spliced"],
adata.obs[cluster],
)
cluster_uid, cluster_ix = np.unique(cluster_uid, return_inverse=True)
U_avgs, S_avgs = clusters_stats(
U, S, cluster_uid, cluster_ix, size_limit=size_limit
)
clu_avg_selected = (U_avgs.max(1) > min_avg_U) & (S_avgs.max(1) > min_avg_S)
return clu_avg_selected
def filter_genes(
adata,
filter_bool=None,
layer="all",
min_cell_s=1,
min_cell_u=1,
min_cell_p=1,
min_avg_exp_s=1e-10,
min_avg_exp_u=0,
min_avg_exp_p=0,
max_avg_exp=np.infty,
min_count_s=0,
min_count_u=0,
min_count_p=0,
shared_count=30,
):
"""Basic filter of genes based a collection of expression filters.
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
filter_bool: :class:`~numpy.ndarray` (default: None)
A boolean array from the user to select genes for downstream analysis.
layer: `str` (default: `X`)
The data from a particular layer (include X) used for feature selection.
min_cell_s: `int` (default: `5`)
Minimal number of cells with expression for the data in the spliced layer (also used for X).
min_cell_u: `int` (default: `5`)
Minimal number of cells with expression for the data in the unspliced layer.
min_cell_p: `int` (default: `5`)
Minimal number of cells with expression for the data in the protein layer.
min_avg_exp_s: `float` (default: `1e-2`)
Minimal average expression across cells for the data in the spliced layer (also used for X).
min_avg_exp_u: `float` (default: `1e-4`)
Minimal average expression across cells for the data in the unspliced layer.
min_avg_exp_p: `float` (default: `1e-4`)
Minimal average expression across cells for the data in the protein layer.
max_avg_exp: `float` (default: `100`.)
Maximal average expression across cells for the data in all layers (also used for X).
min_cell_s: `int` (default: `5`)
Minimal number of counts (UMI/expression) for the data in the spliced layer (also used for X).
min_cell_u: `int` (default: `5`)
Minimal number of counts (UMI/expression) for the data in the unspliced layer.
min_cell_p: `int` (default: `5`)
Minimal number of counts (UMI/expression) for the data in the protein layer.
shared_count: `float` (default: `30`)
The minimal shared number of counts for each genes across cell between layers.
Returns
-------
adata: :class:`~anndata.AnnData`
An updated AnnData object with use_for_pca as a new column in .var attributes to indicate the selection of genes for
downstream analysis. adata will be subsetted with only the genes pass filter if keep_unflitered is set to be False.
"""
detected_bool = np.ones(adata.shape[1], dtype=bool)
detected_bool = (detected_bool) & np.array(
((adata.X > 0).sum(0) >= min_cell_s)
& (adata.X.mean(0) >= min_avg_exp_s)
& (adata.X.mean(0) <= max_avg_exp)
& (adata.X.sum(0) >= min_count_s)
).flatten()
if "spliced" in adata.layers.keys() and (layer == "spliced" or layer == "all"):
detected_bool = (
detected_bool
& np.array(
((adata.layers["spliced"] > 0).sum(0) >= min_cell_s)
& (adata.layers["spliced"].mean(0) >= min_avg_exp_s)
& (adata.layers["spliced"].mean(0) <= max_avg_exp)
& (adata.layers["spliced"].sum(0) >= min_count_s)
).flatten()
)
if "unspliced" in adata.layers.keys() and (layer == "unspliced" or layer == "all"):
detected_bool = (
detected_bool
& np.array(
((adata.layers["unspliced"] > 0).sum(0) >= min_cell_u)
& (adata.layers["unspliced"].mean(0) >= min_avg_exp_u)
& (adata.layers["unspliced"].mean(0) <= max_avg_exp)
& (adata.layers["unspliced"].sum(0) >= min_count_u)
).flatten()
)
if shared_count is not None:
layers = get_layer_keys(adata, "all", False)
detected_bool = detected_bool & get_shared_counts(
adata, layers, shared_count, "gene"
)
############################## The following code need to be updated ##############################
# just remove genes that are not following the protein criteria
if "protein" in adata.obsm.keys() and layer == "protein":
detected_bool = (
detected_bool
& np.array(
((adata.obsm["protein"] > 0).sum(0) >= min_cell_p)
& (adata.obsm["protein"].mean(0) >= min_avg_exp_p)
& (adata.obsm["protein"].mean(0) <= max_avg_exp)
& (adata.layers["protein"].sum(0) >= min_count_p)
).flatten()
)
filter_bool = (
filter_bool & detected_bool if filter_bool is not None else detected_bool
)
adata.var["pass_basic_filter"] = np.array(filter_bool).flatten()
return adata
def select_genes(
adata,
layer="X",
total_szfactor='total_Size_Factor',
keep_filtered=True,
sort_by="SVR",
n_top_genes=2000,
SVRs_kwargs={},
):
"""Select feature genes based on a collection of filters.
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
layer: `str` (default: `X`)
The data from a particular layer (include X) used for feature selection.
total_szfactor: `str` (default: `total_Size_Factor`)
The column name in the .obs attribute that corresponds to the size factor for the total mRNA.
keep_filtered: `bool` (default: `True`)
Whether to keep genes that don't pass the filtering in the adata object.
sort_by: `str` (default: `SVR`)
Which soring method, either SVR, dispersion or Gini index, to be used to select genes.
n_top_genes: `int` (default: `int`)
How many top genes based on scoring method (specified by sort_by) will be selected as feature genes.
Returns
-------
adata: :class:`~anndata.AnnData`
An updated AnnData object with use_for_pca as a new column in .var attributes to indicate the selection of genes for
downstream analysis. adata will be subsetted with only the genes pass filter if keep_unflitered is set to be False.
"""
filter_bool = adata.var["pass_basic_filter"] if "pass_basic_filter" in adata.var.columns \
else np.ones(adata.shape[1], dtype=bool)
if adata.shape[1] <= n_top_genes:
filter_bool = np.ones(adata.shape[1], dtype=bool)
else:
if sort_by == "dispersion":
table = topTable(adata, layer, mode="dispersion")
valid_table = table.query("dispersion_empirical > dispersion_fit")
valid_table = valid_table.loc[
set(adata.var.index[filter_bool]).intersection(valid_table.index), :
]
gene_id = np.argsort(-valid_table.loc[:, "dispersion_empirical"])[
:n_top_genes
]
gene_id = valid_table.iloc[gene_id, :].index
filter_bool = adata.var.index.isin(gene_id)
elif sort_by == "gini":
table = topTable(adata, layer, mode="gini")
valid_table = table.loc[filter_bool, :]
gene_id = np.argsort(-valid_table.loc[:, "gini"])[:n_top_genes]
gene_id = valid_table.index[gene_id]
filter_bool = gene_id.isin(adata.var.index)
elif sort_by == "SVR":
SVRs_args = {
"min_expr_cells": 0,
"min_expr_avg": 0,
"max_expr_avg": np.inf,
"svr_gamma": None,
"winsorize": False,
"winsor_perc": (1, 99.5),
"sort_inverse": False,
}
SVRs_args = update_dict(SVRs_args, SVRs_kwargs)
adata = SVRs(
adata,
layers=layer,
total_szfactor=total_szfactor,
filter_bool=filter_bool,
**SVRs_args
)
filter_bool = get_svr_filter(adata, layer=layer, n_top_genes=n_top_genes, return_adata=False)
if keep_filtered:
adata.var["use_for_pca"] = filter_bool
else:
adata._inplace_subset_var(filter_bool)
adata.var["use_for_pca"] = True
return adata
[docs]def recipe_monocle(
adata,
normalized=None,
layer=None,
total_layers=None,
genes_to_use=None,
method="pca",
num_dim=30,
sz_method='median',
norm_method=None,
pseudo_expr=1,
feature_selection="SVR",
n_top_genes=2000,
relative_expr=True,
keep_filtered_cells=True,
keep_filtered_genes=True,
scopes=None,
fc_kwargs=None,
fg_kwargs=None,
sg_kwargs=None,
):
"""This function is partly based on Monocle R package (https://github.com/cole-trapnell-lab/monocle3).
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
normalized: `None` or `bool` (default: `None`)
If you already normalized your data (or run recipe_monocle already), set this to be `True` to avoid renormalizing your data.
By default it is set to be `None` and the first 20 values of adata.X (if adata.X is sparse) or its first column will be checked to
determine whether you already normalized your data. This only works for UMI based or read-counts data.
layer: str (default: `None`)
The layer(s) to be normalized. Default is all, including RNA (X, raw) or spliced, unspliced, protein, etc.
total_layers: bool, list or None (default `None`)
The layer(s) that can be summed up to get the total mRNA. for example, ["spliced", "unspliced"], ["uu", "ul",
"su", "sl"] or ["total"], etc. If total_layers is `True`, total_layers will be set to be `total` or
["uu", "ul", "su", "sl"] depends on whether you have labeling but no splicing or labeling and splicing data.
genes_to_use: `list` (default: `None`)
A list genes of gene names that will be used to set as the feature genes for downstream analysis.
method: `str` (default: `log`)
The linear dimension reduction methods to be used.
num_dim: `int` (default: `30`)
The number of linear dimensions reduced to.
sz_method: `str` (default: `mean-geometric-mean-total`)
The method used to calculate the expected total reads / UMI used in size factor calculation.
Only `mean-geometric-mean-total` / `geometric` and `median` are supported. When `median` is used, `locfunc` will be replaced with
`np.nanmedian`.
norm_method: `function` or None (default: function `None`)
The method to normalize the data. Can be any numpy function or `Freeman_Tukey`. By default, only .X will be
size normalized and log1p transformed while data in other layers will only be size factor normalized.
pseudo_expr: `int` (default: `1`)
A pseudocount added to the gene expression value before log/log2 normalization.
feature_selection: `str` (default: `SVR`)
Which soring method, either dispersion, SVR or Gini index, to be used to select genes.
n_top_genes: `int` (default: `2000`)
How many top genes based on scoring method (specified by sort_by) will be selected as feature genes.
relative_expr: `bool` (default: `True`)
A logic flag to determine whether we need to divide gene expression values first by size factor before normalization.
keep_filtered_cells: `bool` (default: `True`)
Whether to keep genes that don't pass the filtering in the adata object.
keep_filtered_genes: `bool` (default: `True`)
Whether to keep genes that don't pass the filtering in the adata object.
scopes: `str`, list-like` or `None` (default: `None`)
Scopes are needed when you use non-official gene name as your gene indices (or adata.var_name). This
arugument corresponds to type of types of identifiers, either a list or a comma-separated fields to specify
type of input qterms, e.g. “entrezgene”, “entrezgene,symbol”, [“ensemblgene”, “symbol”]. Refer to official
MyGene.info docs (https://docs.mygene.info/en/latest/doc/query_service.html#available_fields) for full list
of fields.
fc_kwargs: `dict` or None (default: `None`)
Other Parameters passed into the filter_genes function.
fg_kwargs: `dict` or None (default: `None`)
Other Parameters passed into the filter_cells function.
sg_kwargs: `dict` or None (default: `None`)
Other Parameters passed into the select_cells function.
Returns
-------
adata: :class:`~anndata.AnnData`
A updated anndata object that are updated with Size_Factor, normalized expression values, X and reduced dimensions, etc.
"""
n_cells, n_genes = adata.n_obs, adata.n_vars
if np.all(adata.var_names.str.startswith('ENS')) or scopes is not None:
prefix = adata.var_names[0]
if scopes is None:
if prefix[:4] == 'ENSG' or prefix[:7] == 'ENSMUSG':
scopes = 'ensembl.gene'
elif prefix[:4] == 'ENST' or prefix[:7] == 'ENSMUST':
scopes = 'ensembl.transcript'
else:
raise Exception('Your adata object uses non-official gene names as gene index. \n'
'Dynamo finds those IDs are neither from ensembl.gene or ensembl.transcript and thus cannot '
'convert them automatically. \n'
'Please pass the correct scopes or first convert the ensemble ID to gene short name '
'(for example, using mygene package). \n'
'See also dyn.pp.convert2gene_symbol')
adata.var['query'] = [i.split('.')[0] for i in adata.var.index]
if scopes is str:
adata.var[scopes] = adata.var.index
else:
adata.var['scopes'] = adata.var.index
warnings.warn('Your adata object uses non-official gene names as gene index. \n'
'Dynamo is converting those names to official gene names.')
official_gene_df = convert2gene_symbol(adata.var_names, scopes)
merge_df = adata.var.merge(official_gene_df, left_on='query', right_on='query', how='left').set_index(
adata.var.index)
adata.var = merge_df
valid_ind = np.where(merge_df['notfound'] != True)[0]
adata._inplace_subset_var(valid_ind)
adata.var.index = adata.var['symbol'].values.copy()
if norm_method == 'Freeman_Tukey': norm_method = Freeman_Tukey
basic_stats(adata)
has_splicing, has_labeling, _ = detect_datatype(adata)
if has_splicing and has_labeling and type(total_layers) != list:
total_layers = ['uu', 'ul', 'su', 'sl'] if total_layers else None
elif has_labeling and not has_splicing and type(total_layers) != list:
total_layers = ['total'] if total_layers else None
adata = unique_var_obs_adata(adata)
adata = layers2csr(adata)
adata = collapse_adata(adata)
_szFactor, _logged = False, False
if normalized is None:
if 'raw_data' in adata.uns_keys():
_szFactor, _logged = not adata.uns['raw_data'], not adata.uns['raw_data']
else:
# automatically detect whether the data is size-factor normalized -- no integers (only works for readcounts / UMI based data).
_szFactor = not np.allclose(
(adata.X.data[:20] if issparse(adata.X) else adata.X[:, 0]) % 1,
0,
atol=1e-3,
)
# check whether total UMI is the same -- if not the same, logged
if _szFactor:
_logged = not np.allclose(
np.sum(
adata.X.sum(1)[np.random.choice(adata.n_obs, 10)]
- adata.X.sum(1)[0]
),
0,
atol=1e-1,
)
# filter bad cells
filter_cells_kwargs = {
"filter_bool": None,
"layer": "all",
"min_expr_genes_s": min(50, 0.01 * n_genes),
"min_expr_genes_u": min(25, 0.01 * n_genes),
"min_expr_genes_p": min(2, 0.01 * n_genes),
"max_expr_genes_s": np.inf,
"max_expr_genes_u": np.inf,
"max_expr_genes_p": np.inf,
"shared_count": None,
}
if fc_kwargs is not None:
filter_cells_kwargs.update(fc_kwargs)
adata = filter_cells(
adata, keep_filtered=keep_filtered_cells, **filter_cells_kwargs
)
filter_genes_kwargs = {
"filter_bool": None,
"layer": "all",
"min_cell_s": max(5, 0.01 * n_cells),
"min_cell_u": max(5, 0.005 * n_cells),
"min_cell_p": max(5, 0.005 * n_cells),
"min_avg_exp_s": 0,
"min_avg_exp_u": 0,
"min_avg_exp_p": 0,
"max_avg_exp": np.inf,
"min_count_s": 0,
"min_count_u": 0,
"min_count_p": 0,
"shared_count": 30,
}
if fg_kwargs is not None:
filter_genes_kwargs.update(fg_kwargs)
# set pass_basic_filter for genes
adata = filter_genes(
adata,
**filter_genes_kwargs,
)
# calculate sz factor
if not _szFactor or "Size_Factor" not in adata.obs_keys():
adata = szFactor(adata, total_layers=total_layers)
if feature_selection.lower() == "dispersion":
adata = Dispersion(adata)
# set use_for_pca (use basic_filtered data)
select_genes_dict = {
"min_expr_cells": 0,
"min_expr_avg": 0,
"max_expr_avg": np.inf,
"svr_gamma": None,
"winsorize": False,
"winsor_perc": (1, 99.5),
"sort_inverse": False,
}
if sg_kwargs is not None:
select_genes_dict.update(sg_kwargs)
if genes_to_use is None:
pass_basic_filter_num = adata.var.pass_basic_filter.sum()
if pass_basic_filter_num < n_top_genes:
warnings.warn(f'only {pass_basic_filter_num} genes passed basic filtering, but you requested {n_top_genes} '
f'genes for feature selection. Try lowering the gene selection stringency: '
f'{select_genes_dict}')
adata = select_genes(
adata,
sort_by=feature_selection,
n_top_genes=n_top_genes,
keep_filtered=keep_filtered_genes,
SVRs_kwargs=select_genes_dict,
)
else:
adata.var["use_for_pca"] = adata.var.index.isin(genes_to_use)
if not keep_filtered_genes:
adata = adata[:, adata.var["use_for_pca"]]
# normalized data based on sz factor
if not _logged:
total_szfactor = "total_Size_Factor" if total_layers is not None else None
adata = normalize_expr_data(
adata,
total_szfactor=total_szfactor,
norm_method=norm_method,
pseudo_expr=pseudo_expr,
relative_expr=relative_expr,
keep_filtered=keep_filtered_genes,
sz_method=sz_method,
)
else:
layers = get_layer_keys(adata, "all")
for layer in layers:
adata.layers["X_" + layer] = adata.layers[layer].copy()
adata.uns["pp_norm_method"] = None
# only use genes pass filter (based on use_for_pca) to perform dimension reduction.
if layer is None:
CM = adata.X[:, adata.var.use_for_pca.values]
else:
if layer == "X":
CM = adata.X[:, adata.var.use_for_pca.values]
elif layer == "protein":
CM = adata.obsm["X_protein"]
else:
CM = adata.layers["X_" + layer][:, adata.var.use_for_pca.values]
cm_genesums = CM.sum(axis=0)
valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0)
valid_ind = np.array(valid_ind).flatten()
adata.var.iloc[
np.where(adata.var.use_for_pca)[0][~valid_ind],
adata.var.columns.tolist().index("use_for_pca"),
] = False
CM = CM[:, valid_ind]
if method == "pca":
adata, fit, _ = pca(adata, CM, num_dim, "X_" + method.lower())
adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
adata.obsm['X'] = adata.obsm["X_" + method.lower()]
elif method == "ica":
fit = FastICA(
num_dim, algorithm="deflation", tol=5e-6, fun="logcosh", max_iter=1000
)
reduce_dim = fit.fit_transform(CM.toarray())
adata.obsm["X_" + method.lower()] = reduce_dim
adata.obsm['X'] = adata.obsm["X_" + method.lower()]
adata.uns[method + "_fit"], adata.uns["feature_selection"] = fit, feature_selection
# calculate NTR for every cell:
ntr, var_ntr = NTR(adata)
if ntr is not None:
adata.obs['ntr'] = ntr
adata.var['ntr'] = var_ntr
try:
cell_cycle_scores(adata)
except Exception:
warnings.warn('Dynamo is not able to perform cell cycle staging for you automatically. \n'
'Since dyn.pl.phase_diagram in dynamo by default color cells by its cell-cycle stage, \n'
'you need to set color argument accordingly if confronting errors related to this.')
if 'raw_data' in adata.uns_keys():
adata.uns['raw_data'] = False
return adata
def recipe_velocyto(
adata,
total_layers=None,
method="pca",
num_dim=30,
norm_method=None,
pseudo_expr=1,
feature_selection="SVR",
n_top_genes=2000,
cluster="Clusters",
relative_expr=True,
keep_filtered_genes=True,
):
"""This function is adapted from the velocyto's DentateGyrus notebook.
.
Parameters
----------
adata: :class:`~anndata.AnnData`
AnnData object.
total_layers: list or None (default `None`)
The layer(s) that can be summed up to get the total mRNA. for example, ["spliced", "unspliced"], ["uu", "ul", "su", "sl"] or ["new", "old"], etc.
method: `str` (default: `log`)
The linear dimension reduction methods to be used.
num_dim: `int` (default: `50`)
The number of linear dimensions reduced to.
norm_method: `function`, `str` or `None` (default: function `None`)
The method to normalize the data.
pseudo_expr: `int` (default: `1`)
A pseudocount added to the gene expression value before log/log2 normalization.
feature_selection: `str` (default: `SVR`)
Which soring method, either dispersion, SVR or Gini index, to be used to select genes.
n_top_genes: `int` (default: `2000`)
How many top genes based on scoring method (specified by sort_by) will be selected as feature genes.
cluster: `str`
A column in the adata.obs attribute which will be used for cluster specific expression filtering.
relative_expr: `bool` (default: `True`)
A logic flag to determine whether we need to divide gene expression values first by size factor before normalization.
keep_filtered_genes: `bool` (default: `True`)
Whether to keep genes that don't pass the filtering in the adata object.
Returns
-------
adata: :class:`~anndata.AnnData`
A updated anndata object that are updated with Size_Factor, normalized expression values, X and reduced dimensions, etc.
"""
adata = szFactor(adata, method="mean", total_layers=total_layers)
initial_Ucell_size = adata.layers["unspliced"].sum(1)
filter_bool = initial_Ucell_size > np.percentile(initial_Ucell_size, 0.4)
adata = filter_cells(adata, filter_bool=np.array(filter_bool).flatten())
filter_bool = filter_genes(adata, min_cell_s=30, min_count_s=40, shared_count=None)
adata = adata[:, filter_bool]
adata = SVRs(
adata, layers=["spliced"], min_expr_cells=2, max_expr_avg=35, min_expr_avg=0
)
filter_bool = get_svr_filter(adata, layer="spliced", n_top_genes=n_top_genes)
adata = adata[:, filter_bool]
filter_bool_gene = filter_genes(
adata,
min_cell_s=0,
min_count_s=0,
min_count_u=25,
min_cell_u=20,
shared_count=None,
)
filter_bool_cluster = filter_genes_by_clusters_(
adata, min_avg_S=0.08, min_avg_U=0.01, cluster=cluster
)
adata = adata[:, filter_bool_gene & filter_bool_cluster]
adata = normalize_expr_data(
adata,
total_szfactor=None,
norm_method=norm_method,
pseudo_expr=pseudo_expr,
relative_expr=relative_expr,
keep_filtered=keep_filtered_genes,
)
CM = adata.X
cm_genesums = CM.sum(axis=0)
valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0)
valid_ind = np.array(valid_ind).flatten()
adata.var.use_for_pca[np.where(adata.var.use_for_pca)[0][~valid_ind]] = False
CM = CM[:, valid_ind]
if method == "pca":
adata, fit, _ = pca(adata, CM, num_dim, "X_" + method.lower())
# adata.obsm['X_' + method.lower()] = reduce_dim
elif method == "ica":
cm_genesums = CM.sum(axis=0)
valid_ind = (np.isfinite(cm_genesums)) + (cm_genesums != 0)
valid_ind = np.array(valid_ind).flatten()
CM = CM[:, valid_ind]
fit = FastICA(
num_dim, algorithm="deflation", tol=5e-6, fun="logcosh", max_iter=1000
)
reduce_dim = fit.fit_transform(CM.toarray())
adata.obsm["X_" + method.lower()] = reduce_dim
add_noise_to_duplicates(adata, method.lower())
adata.uns[method + "_fit"], adata.uns["feature_selection"] = fit, feature_selection
# calculate NTR for every cell:
ntr = NTR(adata)
if ntr is not None: adata.obs['ntr'] = ntr
return adata