Source code for dynamo.tools.markers

from typing import List, Optional, Tuple, Union

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

import warnings
from collections import Counter

import numpy as np
import pandas as pd
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from anndata import AnnData
from patsy import bs, cr, dmatrix
from scipy import stats
from scipy.sparse import issparse
from scipy.stats import mannwhitneyu
from statsmodels.genmod.generalized_linear_model import GLMResultsWrapper
from statsmodels.sandbox.stats.multicomp import multipletests
from tqdm import tqdm

from ..dynamo_logger import (
    main_critical,
    main_exception,
    main_info,
    main_tqdm,
    main_warning,
)
from ..preprocessing.transform import _Freeman_Tukey
from ..tools.connectivity import generate_neighbor_keys, check_and_recompute_neighbors
from .utils import fdr, fetch_X_data


[docs]def moran_i( adata: AnnData, X_data: Optional[np.ndarray] = None, genes: Optional[List[str]] = None, layer: Optional[str] = None, weighted: bool = True, assumption: Literal["permutation", "normality", "randomization"] = "permutation", local_moran: bool = False, ) -> AnnData: """Identify genes with strong spatial autocorrelation with Moran's I test. This can be used to identify genes that are potentially related to critical dynamic process. Moran's I test is first introduced in single cell genomics analysis in (Cao, et al. 2019). Note that moran_i supports performing spatial autocorrelation analysis for any layer or normalized data in your adata object. That is you can either use the total, new, unspliced or velocity, etc. for the Moran's I analysis. Args: adata: An AnnData object. X_data: The user supplied data that will be used for Moran's I calculation directly. Defaults to None. genes: The list of genes that will be used to subset the data for dimension reduction and clustering. If `None`, all genes will be used. Defaults to None. layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is used. Defaults to None. weighted: Whether to consider edge weights in the spatial weights graph. Defaults to True. assumption: Assumption of the Moran's I test. Inference for Moran's I is based on a null hypothesis of spatial randomness. The distribution of the statistic under the null can be derived using either an assumption of normality (independent normal random variates), or so-called randomization (i.e., each value is equally likely to occur at any location). An alternative to an analytical derivation is a computational approach based on permutation. This calculates a reference distribution for the statistic under the null hypothesis of spatial randomness by randomly permuting the observed values over the locations. The statistic is computed for each of these randomly reshuffled data sets, which yields a reference distribution. Defaults to "permutation". local_moran: Whether to also calculate local Moran's I. Defaults to False. Raises: ImportError: Package pysal not installed. ValueError: `X_data` is provided but `genes` does not correspond to its columns. Returns: The updated AnnData object with a new key `'Moran_' + type` in the .uns attribute, storing the Moran' I test results. """ try: from pysal import explore, lib except ImportError: raise ImportError( "You need to install the package `pysal`." "Please install via pip install pysal. See more details at https://pypi.org/project/pysal/" ) if X_data is None: genes, X_data = fetch_X_data(adata, genes, layer) else: if genes is None or len(genes) != X_data.shape[1]: raise ValueError( f"When providing X_data, a list of genes name that corresponds to the columns of X_data " f"must be provided" ) cell_num, gene_num = X_data.shape embedding_key = "X_umap" if layer is None else layer + "_umap" neighbor_result_prefix = "" if layer is None else layer conn_key, dist_key, neighbor_key = generate_neighbor_keys(neighbor_result_prefix) if neighbor_key not in adata.uns.keys(): main_warning( f"Neighbor graph is required for Moran's I test. No neighbor_key {neighbor_key} exists in the data. " f"Running reduceDimension which will generate the neighbor graph and the associated low dimension" f"embedding {embedding_key}. " ) from .dimension_reduction import reduceDimension adata = reduceDimension(adata, X_data=X_data, layer=layer) check_and_recompute_neighbors(adata, result_prefix=neighbor_result_prefix) neighbor_graph = adata.obsp[conn_key] # convert a sparse adjacency matrix to a dictionary adj_dict = {i: row.indices for i, row in enumerate(neighbor_graph)} if weighted: weight_dict = {i: row.data for i, row in enumerate(neighbor_graph)} W = lib.weights.W(adj_dict, weight_dict) else: W = lib.weights.W(adj_dict) sparse = issparse(X_data) Moran_I, p_value, statistics = ( [None] * gene_num, [None] * gene_num, [None] * gene_num, ) if local_moran: l_moran = np.zeros(X_data.shape) for cur_g in tqdm(range(gene_num), desc="Moran’s I Global Autocorrelation Statistic"): cur_X = X_data[:, cur_g].A if sparse else X_data[:, cur_g] mbi = explore.esda.moran.Moran(cur_X, W, two_tailed=False) Moran_I[cur_g] = mbi.I p_value[cur_g] = ( mbi.p_sim if assumption == "permutation" else mbi.p_norm if assumption == "normality" else mbi.p_rand ) statistics[cur_g] = ( mbi.z_sim if assumption == "permutation" else mbi.z_norm if assumption == "normality" else mbi.z_sim ) if local_moran: l_moran[:, cur_g] = explore.esda.moran.Moran_Local(cur_X, W).Is Moran_res = pd.DataFrame( { "moran_i": Moran_I, "moran_p_val": p_value, "moran_q_val": fdr(np.array(p_value)), "moran_z": statistics, }, index=genes, ) adata.var = adata.var.merge(Moran_res, left_index=True, right_index=True, how="left") if local_moran: adata.uns["local_moran"] = l_moran return adata
[docs]def find_group_markers( adata: AnnData, group: str, genes: Optional[List[str]] = None, layer: Optional[str] = None, exp_frac_thresh: Optional[float] = None, log2_fc_thresh: Optional[float] = None, qval_thresh: float = 0.05, de_frequency: int = 1, subset_control_vals: Optional[bool] = None, ) -> AnnData: """Find marker genes for each group of cells based on gene expression or velocity values as specified by the layer. Tests each gene for differential expression between cells in one group to cells from all other groups via Mann-Whitney U test. It also calculates the fraction of cells with non-zero expression, log 2-fold changes as well as the specificity (calculated as 1 - Jessen-Shannon distance between the distribution of percentage of cells with expression across all groups to the hypothetical perfect distribution in which only the test group of cells has expression). In addition, Rank-biserial correlation (rbc) and qval are calculated. The rank biserial correlation is used to assess the relationship between a dichotomous categorical variable and an ordinal variable. The rank biserial test is very similar to the non-parametric Mann-Whitney U test that is used to compare two independent groups on an ordinal variable. Mann-Whitney U tests are preferable to rank biserial correlations when comparing independent groups. Rank biserial correlations can only be used with dichotomous (two levels) categorical variables. qval is calculated using Benjamini-Hochberg adjustment. Note that this function is designed in a general way so that you can either use the total, new, unspliced or velocity, etc. to identify differentially expressed genes. This function is adapted from https://github.com/KarlssonG/nabo/blob/master/nabo/_marker.py and Monocle 3alpha. Args: adata: An AnnData object. group: The column key/name that identifies the grouping information (for example, clusters that correspond to different cell types or different time points) of cells. This will be used for calculating group-specific genes. genes: The list of genes that will be used to subset the data for dimension reduction and clustering. If `None`, all genes will be used. Defaults to None. layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is used. Defaults to None. exp_frac_thresh: The minimum percentage of cells with expression for a gene to proceed differential expression test. If `layer` is not `velocity` related (i.e. `velocity_S`), `exp_frac_thresh` by default is set to be 0.1, otherwise 0. Defaults to None. log2_fc_thresh: The minimal threshold of log2 fold change for a gene to proceed differential expression test. If `layer` is not `velocity` related (i.e. `velocity_S`), `log2_fc_thresh` by default is set to be 1, otherwise 0. Defaults to None. qval_thresh: The minimal threshold of qval to be considered as significant genes. Defaults to 0.05. de_frequency: Minimum number of clusters against a gene should be significantly differentially expressed for it to qualify as a marker. Defaults to 1. subset_control_vals: Whether to subset the top ranked control values. When `subset_control_vals = None`, this is subset to be `True` when layer is not related to either `velocity` related or `acceleration` or `curvature` related layers and `False` otherwise. When layer is not related to either `velocity` related or `acceleration` or `curvature` related layers used, the control values will be sorted by absolute values. Defaults to None. Raises: ValueError: Gene list does not overlap with genes in adata. ValueError: `group` is invalid. ValueError: .obs[group] does not contain enough number of groups. Returns: An updated `~anndata.AnnData` with a new property `cluster_markers` in the .uns attribute, which includes a concated pandas DataFrame of the differential expression analysis result for all groups and a dictionary where keys are cluster numbers and values are lists of marker genes for the corresponding clusters. """ if layer is None or not (layer.startswith("velocity") or layer in ["acceleration", "curvature"]): exp_frac_thresh = 0.1 if exp_frac_thresh is None else exp_frac_thresh log2_fc_thresh = 1 if log2_fc_thresh is None else log2_fc_thresh subset_control_vals = True if subset_control_vals is None else subset_control_vals else: exp_frac_thresh = 0 if exp_frac_thresh is None else exp_frac_thresh log2_fc_thresh = None subset_control_vals = False if subset_control_vals is None else subset_control_vals genes, X_data = fetch_X_data(adata, genes, layer) if len(genes) == 0: raise ValueError(f"No genes from your genes list appear in your adata object.") if group not in adata.obs.keys(): raise ValueError(f"group {group} is not a valid key for .obs in your adata object.") else: adata.obs[group] = adata.obs[group].astype("str") cluster_set = adata.obs[group].unique() if len(cluster_set) < 2: raise ValueError(f"the number of groups for the argument {group} must be at least two.") de_tables = [None] * len(cluster_set) de_genes = {} if len(cluster_set) > 2: for i, test_group in enumerate(cluster_set): control_groups = sorted(set(cluster_set).difference([test_group])) de = two_groups_degs( adata, genes, layer, group, test_group, control_groups, X_data, exp_frac_thresh, log2_fc_thresh, qval_thresh, subset_control_vals, ) de_tables[i] = de.copy() de_genes[i] = [k for k, v in Counter(de["gene"]).items() if v >= de_frequency] else: de = two_groups_degs( adata, genes, layer, group, cluster_set[0], cluster_set[1], X_data, exp_frac_thresh, log2_fc_thresh, qval_thresh, subset_control_vals, ) de_tables[0] = de.copy() de_genes[0] = [k for k, v in Counter(de["gene"]).items() if v >= de_frequency] de_table = pd.concat(de_tables).reset_index().drop(columns=["index"]) de_table["log2_fc"] = de_table["log2_fc"].astype("float") adata.uns["cluster_markers"] = {"deg_table": de_table} for key, value in de_genes.items(): adata.uns["cluster_markers"]["de_genes" + str(key)] = value return adata
[docs]def two_groups_degs( adata: AnnData, genes: Optional[List[str]] = None, layer: Optional[str] = None, group: Optional[str] = None, test_group: Optional[str] = None, control_groups: List[str] = None, X_data: Optional[np.ndarray] = None, exp_frac_thresh: Optional[float] = None, log2_fc_thresh: Optional[str] = None, qval_thresh: float = 0.05, subset_control_vals: Optional[bool] = None, ) -> pd.DataFrame: """Find marker genes between two groups of cells based on gene expression or velocity as specified by the layer. Tests each gene for differential expression between cells in one group to cells from another groups via Mann-Whitney U test. It also calculates the fraction of cells with non-zero expression, log 2-fold changes as well as the specificity (calculated as 1 - Jessen-Shannon distance between the distribution of percentage of cells with expression across all groups to the hypothetical perfect distribution in which only the current group of cells has expression). In addition, Rank-biserial correlation (rbc) and qval are calculated. The rank biserial correlation is used to assess the relationship between a dichotomous categorical variable and an ordinal variable. The rank biserial test is very similar to the non-parametric Mann-Whitney U test that is used to compare two independent groups on an ordinal variable. Mann-Whitney U tests are preferable to rank biserial correlations when comparing independent groups. Rank biserial correlations can only be used with dichotomous (two levels) categorical variables. qval is calculated using Benjamini-Hochberg adjustment. Args: adata: An AnnData object. genes: The list of genes that will be used to subset the data for dimension reduction and clustering. If `None`, all genes will be used. layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is used. group: The column key/name that identifies the grouping information (for example, clusters that correspond to different cell types or different time points) of cells. This will be used for calculating group-specific genes. test_group: The group name from `group` for which markers has to be found. control_groups: The list of group name(s) from `group` for which markers has to be tested against. X_data: the user supplied data that will be used for marker gene detection directly. Defaults to None. exp_frac_thresh: the minimum percentage of cells with expression for a gene to proceed differential expression test. If `layer` is not `velocity` related (i.e. `velocity_S`), `exp_frac_thresh` by default is set to be 0.1, otherwise 0. Defaults to None. log2_fc_thresh: The minimal threshold of log2 fold change for a gene to proceed differential expression test. If `layer` is not `velocity` related (i.e. `velocity_S`), `log2_fc_thresh` by default is set to be 1, otherwise 0. Defaults to None. qval_thresh: The maximal threshold of qval to be considered as significant genes. Defaults to 0.05. subset_control_vals: Whether to subset the top ranked control values. When `subset_control_vals = None`, this is subset to be `True` when layer is not related to either `velocity` related or `acceleration` or `curvature` related layers and `False` otherwise. When layer is not related to either `velocity` related or `acceleration` or `curvature` related layers used, the control values will be sorted by absolute values. Defaults to None. Raises: ValueError: `X_data` is provided but `genes` does not correspond to its columns. Returns: A pandas DataFrame of the differential expression analysis result between the two groups. """ if layer is None or not (layer.startswith("velocity") or layer in ["acceleration", "curvature"]): exp_frac_thresh = 0.1 if exp_frac_thresh is None else exp_frac_thresh log2_fc_thresh = 1 if log2_fc_thresh is None else log2_fc_thresh subset_control_vals = True if subset_control_vals is None else subset_control_vals else: exp_frac_thresh = 0 if exp_frac_thresh is None else exp_frac_thresh log2_fc_thresh = None subset_control_vals = False if subset_control_vals is None else subset_control_vals if X_data is None: genes, X_data = fetch_X_data(adata, genes, layer) else: if genes is None or len(genes) != X_data.shape[1]: raise ValueError( f"When providing X_data, a list of genes name that corresponds to the columns of X_data " f"must be provided" ) n_cells, n_genes = X_data.shape sparse = issparse(X_data) if type(control_groups) == str: control_groups = [control_groups] test_cells, control_cells = ( adata.obs[group] == test_group, adata.obs[group].isin(control_groups), ) num_test_cells = test_cells.sum() num_groups = len(control_groups) if subset_control_vals: min_n = [min(num_test_cells, sum(adata.obs[group] == x)) for x in control_groups] n1n2 = [num_test_cells * x for x in min_n] de = [] for i_gene, gene in tqdm(enumerate(genes), desc="identifying top markers for each group"): rbc, specificity_, mw_p, log_fc, ncells = 0, 0, 1, 0, 0 all_vals = X_data[:, i_gene].A if sparse else X_data[:, i_gene] test_vals = all_vals[test_cells] perc, ef = [len(test_vals.nonzero()[0]) / n_cells], len(test_vals.nonzero()[0]) / num_test_cells if ef < exp_frac_thresh: continue log_mean_test_vals = np.log2(test_vals.mean()) perc.extend([len(all_vals[adata.obs[group] == x].nonzero()[0]) / n_cells for x in control_groups]) for i in range(num_groups): control_vals = all_vals[adata.obs[group] == control_groups[i]] if subset_control_vals: if layer is None or not (layer.startswith("velocity") or layer in ["acceleration", "curvature"]): control_vals.sort() else: control_vals = control_vals[np.argsort(np.absolute(control_vals))] control_vals = control_vals[-min_n[i] :] cur_n1n2 = n1n2[i] else: cur_n1n2 = num_test_cells * len(control_vals) if log2_fc_thresh is not None: mean_control_vals = control_vals.mean() if mean_control_vals == 0: log_fc = np.inf else: log_fc = log_mean_test_vals - np.log2(mean_control_vals) if abs(log_fc) < log2_fc_thresh: continue else: log_fc = test_vals.mean() - control_vals.mean() # for curvature, acceleration, log fc is meaningless try: u, mw_p = map(float, mannwhitneyu(test_vals, control_vals)) except ValueError: pass else: rbc = 1 - ((2 * u) / cur_n1n2) perfect_specificity = np.repeat(0.0, num_groups + 1) perfect_specificity[i + 1] = 1.0 specificity_ = specificity(perc, perfect_specificity) tmp0, tmp1 = sum(np.sign(test_vals) > 0), sum(np.sign(control_vals) > 0) tmp0 = tmp0 if np.isscalar(tmp0) else tmp0[0] tmp1 = tmp0 if np.isscalar(tmp1) else tmp1[0] diff_ratio_pos = tmp0 / len(test_vals) - tmp1 / len(control_vals) de.append( ( gene, control_groups[i], ef, rbc, log_fc, mw_p, specificity_, diff_ratio_pos, ) ) de = pd.DataFrame( de, columns=[ "gene", "versus_group", "exp_frac", "rbc", "log2_fc", "pval", "specificity", "diff_ratio_pos", ], ) de = de[de.iloc[:, 2:].sum(1) > 0] if de.shape[0] > 1: de["qval"] = multipletests(de["pval"].values, method="fdr_bh")[1] else: de["qval"] = [np.nan for _ in range(de.shape[0])] de["test_group"] = [test_group for _ in range(de.shape[0])] out_order = [ "gene", "test_group", "versus_group", "specificity", "exp_frac", "diff_ratio_pos", "rbc", "log2_fc", "pval", "qval", ] de = de[out_order].sort_values(by="qval") res = de[(de.qval < qval_thresh)].reset_index().drop(columns=["index"]) return res
[docs]def top_n_markers( adata: AnnData, with_moran_i: bool = False, group_by: str = "test_group", sort_by: Union[str, List[str]] = "specificity", sort_order: Literal["increasing", "decreasing"] = "decreasing", top_n_genes: int = 5, exp_frac_thresh: float = 0.1, log2_fc_thresh: Optional[float] = None, qval_thresh: float = 0.05, specificity_thresh: float = 0.3, only_gene_list: bool = False, display: bool = True, ) -> Union[List[List[str]], pd.DataFrame]: """Filter cluster deg (Moran's I test) results and retrieve top markers for each cluster. Args: adata: An AnnData object. with_moran_i: Whether to include Moran's I test results for selecting top marker genes. Defaults to False. group_by: Column name or names to group by. Defaults to "test_group". sort_by: Column name or names to sort by. Defaults to "specificity". sort_order: Whether to sort the data frame with `increasing` or `decreasing` order. Defaults to "decreasing". top_n_genes: The number of top sorted markers. Defaults to 5. exp_frac_thresh: The minimum percentage of cells with expression for a gene to proceed selection of top markers. Defaults to 0.1. log2_fc_thresh: The minimal threshold of log2 fold change for a gene to proceed selection of top markers. Applicable to none `velocity`, `acceleration` or `curvature` layers based DEGs. Defaults to None. qval_thresh: The maximal threshold of qval to be considered as top markers. Defaults to 0.05. specificity_thresh: The minimum threshold of specificity to be considered as top markers. Defaults to 0.3. only_gene_list: Whether to only return the gene list for each cluster. Defaults to False. display: Whether to print the data frame for the top marker genes after the filtering. Defaults to True. Raises: ValueError: Threshold too extreme that no genes passed the filter. Returns: If `only_gene_list` is false, a data frame that stores the top marker for each group would be returned. Otherwise, a list containing lists of genes in each cluster would be returned. In addition, it will display the data frame depending on whether `display` is set to be True. """ if "cluster_markers" not in adata.uns.keys(): main_warning( f"No info of cluster markers stored in your adata. " f"Running `find_group_markers` with default parameters." ) adata = find_group_markers(adata, group="clusters") deg_table = adata.uns["cluster_markers"]["deg_table"] if len(deg_table["log2_fc"].unique()) > 1: if np.abs(deg_table["log2_fc"]).max() < 1: log2_fc_thresh = -np.inf if log2_fc_thresh is None else log2_fc_thresh else: log2_fc_thresh = 1 if log2_fc_thresh is None else log2_fc_thresh deg_table = deg_table.query( "exp_frac > @exp_frac_thresh and " "log2_fc > @log2_fc_thresh and " "qval < @qval_thresh and " "specificity > @specificity_thresh" ) else: deg_table = deg_table.query( "exp_frac > @exp_frac_thresh and " "qval < @qval_thresh and " "specificity > @specificity_thresh" ) if deg_table.shape[0] == 0: raise ValueError( f"Looks like your filter threshold is too extreme. No gene detected. " f"Please try relaxing the thresholds you specified: " f"exp_frac_thresh: {exp_frac_thresh}" f"log2_fc_thresh: {log2_fc_thresh}" f"qval_thresh: {qval_thresh}" f"specificity_thresh: {specificity_thresh}" ) if with_moran_i: moran_i_columns = ["moran_i", "moran_p_val", "moran_q_val", "moran_z"] if len(adata.var.columns.intersection(moran_i_columns)) != 4: main_warning( f"No info of cluster markers stored in your adata. " f"Running `find_group_markers` with default parameters." ) adata = moran_i(adata) moran_i_df = adata.var[moran_i_columns] deg_table = deg_table.merge(moran_i_df, left_on="gene", right_index=True, how="left") top_n_df = ( deg_table.groupby(group_by).apply(lambda grp: grp.nlargest(top_n_genes, sort_by)) if sort_order == "decreasing" else deg_table.groupby(group_by).apply(lambda grp: grp.nsmallest(top_n_genes, sort_by)) ) if display: with pd.option_context( "display.max_rows", None, "display.max_columns", None, "display.width", 1000, ): print(top_n_df) top_n_groups = top_n_df.loc[:, group_by].unique() de_genes = [None] * len(top_n_groups) if only_gene_list: for i in top_n_groups: de_genes[i] = top_n_df[top_n_df[group_by] == i].loc[:, "gene"].to_list() return de_genes else: return top_n_df
[docs]def glm_degs( adata: AnnData, X_data: Optional[np.ndarray] = None, genes: Optional[List[str]] = None, layer: Optional[str] = None, fullModelFormulaStr: str = "~cr(integral_time, df=3)", reducedModelFormulaStr: str = "~1", family: Literal["NB2"] = "NB2", ) -> None: """Differential genes expression tests using generalized linear regressions. The results would be stored in the adata's .uns["glm_degs"] annotation and the update is inplace. Tests each gene for differential expression as a function of integral time (the time estimated via the reconstructed vector field function) or pseudotime using generalized additive models with natural spline basis. This function can also use other covariates as specified in the full (i.e `~clusters`) and reduced model formula to identify differentially expression genes across different categories, group, etc. glm_degs relies on statsmodels package and is adapted from the `differentialGeneTest` function in Monocle. Note that glm_degs supports performing deg analysis for any layer or normalized data in your adata object. That is you can either use the total, new, unspliced or velocity, etc. for the differential expression analysis. Args: adata: An AnnData object. X_data: The user supplied data that will be used for differential expression analysis directly. Defaults to None. genes: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is used. Defaults to None. layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is used. Defaults to None. fullModelFormulaStr: A formula string specifying the full model in differential expression tests (i.e. likelihood ratio tests) for each gene/feature. Defaults to "~cr(integral_time, df=3)". reducedModelFormulaStr: A formula string specifying the reduced model in differential expression tests (i.e. likelihood ratio tests) for each gene/feature. Defaults to "~1". family: The distribution family used for the expression responses in statsmodels. Currently, always uses `NB2` and this is ignored. NB model requires us to define a parameter alpha which it uses to express the variance in terms of the mean as follows: variance = mean + alpha mean^p. When p=2, it corresponds to the NB2 model. In order to obtain the correct parameter alpha (sm.genmod.families.family.NegativeBinomial (link=None, alpha=1.0), by default it is 1), we use the auxiliary OLS regression without a constant from Messrs Cameron and Trivedi. More details can be found here: https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4. Defaults to "NB2". Raises: ValueError: `X_data` is provided but `genes` does not correspond to its columns. Exception: Factors from the model formula `fullModelFormulaStr` invalid. """ if X_data is None: genes, X_data = fetch_X_data(adata, genes, layer) else: if genes is None or len(genes) != X_data.shape[1]: raise ValueError( f"When providing X_data, a list of genes name that corresponds to the columns of X_data " f"must be provided" ) norm_method_key = "X_norm_method" if layer is None or layer == "X" else "layers_norm_method" if layer is None: if issparse(X_data): X_data.data = ( 2**X_data.data - 1 if adata.uns["pp"][norm_method_key] == "log2" else np.exp(X_data.data) - 1 if adata.uns["pp"][norm_method_key] == "log" else _Freeman_Tukey(X_data.data + 1, inverse=True) - 1 if adata.uns["pp"][norm_method_key] == "Freeman_Tukey" else X_data.data ) else: X_data = ( 2**X_data - 1 if adata.uns["pp"][norm_method_key] == "log2" else np.exp(X_data) - 1 if adata.uns["pp"][norm_method_key] == "log" else _Freeman_Tukey(X_data, inverse=True) if adata.uns["pp"][norm_method_key] == "Freeman_Tukey" else X_data ) factors = get_all_variables(fullModelFormulaStr) factors = ["Pseudotime" if i == "cr(Pseudotime, df=3)" else i for i in factors] if len(set(factors).difference(adata.obs.columns)) == 0: df_factors = adata.obs[factors] else: raise Exception( f"adata object doesn't include the factors from the model formula " f"{fullModelFormulaStr} you provided." ) sparse = issparse(X_data) deg_df = pd.DataFrame(index=genes, columns=["status", "family", "pval"]) for i, gene in tqdm( enumerate(genes), "Detecting time dependent genes via Generalized Additive Models (GAMs)", ): expression = X_data[:, i].A if sparse else X_data[:, i] df_factors["expression"] = expression deg_df.iloc[i, :] = diff_test_helper(df_factors, fullModelFormulaStr, reducedModelFormulaStr) deg_df["qval"] = multipletests(deg_df["pval"], method="fdr_bh")[1] adata.uns["glm_degs"] = deg_df
def diff_test_helper( data: pd.DataFrame, fullModelFormulaStr: str = "~cr(time, df=3)", reducedModelFormulaStr: str = "~1", ) -> Union[Tuple[Literal["fail"], Literal["NB2"], Literal[1]], Tuple[Literal["ok"], Literal["NB2"], np.ndarray],]: """A helper function to generate required data fields for differential gene expression test. Args: data: The original dataframe containing expression data. fullModelFormulaStr: A formula string specifying the full model in differential expression tests (i.e. likelihood ratio tests) for each gene/feature. Defaults to "~cr(integral_time, df=3)". reducedModelFormulaStr: A formula string specifying the reduced model in differential expression tests (i.e. likelihood ratio tests) for each gene/feature. Defaults to "~1". Returns: A tuple [parseResult, family, pval], where `parseResult` should be "ok" or "fail", showing whether the provided dataframe is successfully parsed or not. `family` is the distribution family used for the expression responses in statsmodels, currently only "NB2" is supported. `pval` is the survival probability (1-cumulative probability) to observe the likelihood ratio for the constrained model to be true. If parsing dataframe failed, this value is set to be 1. """ # Dividing data into train and validation datasets transformed_x = dmatrix(fullModelFormulaStr, data, return_type="dataframe") transformed_x_null = dmatrix(reducedModelFormulaStr, data, return_type="dataframe") expression = data["expression"] try: # poisson_training_results = sm.GLM(expression, transformed_x, family=sm.families.Poisson()).fit() # poisson_df = pd.DataFrame({"mu": poisson_training_results.mu, "expression": expression}) # poisson_df["AUX_OLS_DEP"] = poisson_df.apply( # lambda x: ((x["expression"] - x["mu"]) ** 2 - x["expression"]) / x["mu"], # axis=1, # ) # ols_expr = """AUX_OLS_DEP ~ mu - 1""" # aux_olsr_results = smf.ols(ols_expr, poisson_df).fit() nb2_family = sm.families.NegativeBinomial() # (alpha=aux_olsr_results.params[0]) nb2_full = sm.GLM(expression, transformed_x, family=nb2_family).fit() nb2_null = sm.GLM(expression, transformed_x_null, family=nb2_family).fit() except: return ("fail", "NB2", 1) pval = lrt(nb2_full, nb2_null) return ("ok", "NB2", pval) def get_all_variables(formula: str) -> List[str]: """A helper function to get all variable names for a formula string. Args: formula: The formula string specifying the model in differential expression data. Returns: A list of variable names. """ md = patsy.ModelDesc.from_formula(formula) termlist = md.rhs_termlist + md.lhs_termlist factors = [] for term in termlist: for factor in term.factors: factors.append(factor.name()) return factors def lrt(full: GLMResultsWrapper, restr: GLMResultsWrapper) -> np.float64: """Perform likelihood-ratio test on the full model and constrained model. Args: full: The regression model without constraint. restr: The regression model after constraint. Returns: The survival probability (1-cumulative probability) to observe the likelihood ratio for the constrained model to be true. """ llf_full = full.llf llf_restr = restr.llf df_full = full.df_resid df_restr = restr.df_resid lrdf = df_restr - df_full lrstat = -2 * (llf_restr - llf_full) lr_pvalue = stats.chi2.sf(lrstat, df=lrdf) return lr_pvalue def specificity(percentage: np.ndarray, perfect_specificity: np.ndarray) -> float: """Calculate specificity""" spec = 1 - JSdistVec(makeprobsvec(percentage), perfect_specificity) return spec def makeprobsvec(p: np.ndarray) -> np.ndarray: """Calculate the probability matrix for a relative abundance matrix""" phat = p / np.sum(p) phat[np.isnan((phat))] = 0 return phat def shannon_entropy(p: np.ndarray) -> float: """Calculate the Shannon entropy based on the probability vector""" if np.min(p) < 0 or np.sum(p) <= 0: return np.inf p_norm = p[p > 0] / np.sum(p) return -np.sum(np.log(p_norm) * p_norm) def JSdistVec(p: np.ndarray, q: np.ndarray) -> float: """Calculate the Jessen-Shannon distance for two probability distribution""" Jsdiv = shannon_entropy((p + q) / 2) - (shannon_entropy(p) + shannon_entropy(q)) / 2 if np.isinf(Jsdiv): Jsdiv = 1 if Jsdiv < 0: Jsdiv = 0 JSdist = np.sqrt(Jsdiv) return JSdist