Source code for dynamo.plot.preprocess

from typing import Any, Dict, List, Optional, Tuple

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from anndata import AnnData
from matplotlib.axes import Axes
from matplotlib.collections import PathCollection
from scipy.sparse import csr_matrix, issparse

from ..configuration import DynamoAdataKeyManager
from ..dynamo_logger import main_warning
from ..preprocessing import gene_selection
from ..preprocessing.gene_selection import get_prediction_by_svr
from ..preprocessing.utils import detect_experiment_datatype
from ..tools.utils import get_mapper
from .utils import save_fig, save_show_ret


[docs]def basic_stats( adata: AnnData, group: Optional[str] = None, figsize: Tuple[float, float] = (4, 3), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> Optional[sns.FacetGrid]: """Plot the basic statics (nGenes, nCounts and pMito) of each category of adata. Args: adata: an AnnData object. group: the column key of `adata.obs` to facet the data into subplots. If None, no faceting will be used. Defaults to None. figsize: the size of each panel in the figure. Defaults to (4, 3). save_show_or_return: whether to save, show, or return the plots. Could be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the {"path": None, "prefix": 'basic_stats', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Returns: None would be returned by default. If `save_show_or_return` is set to 'return', the generated figure (seaborn.FacetGrid) would be returned. """ if len(adata.obs.columns.intersection(["nGenes", "nCounts", "pMito"])) != 3: from ..preprocessing.QC import basic_stats basic_stats(adata) df = pd.DataFrame( { "nGenes": adata.obs["nGenes"], "nCounts": adata.obs["nCounts"], "pMito": adata.obs["pMito"], }, index=adata.obs.index, ) if group is not None and group in adata.obs.columns: df["group"] = adata.obs.loc[:, group] res = df.melt(value_vars=["nGenes", "nCounts", "pMito"], id_vars=["group"]) else: res = df.melt(value_vars=["nGenes", "nCounts", "pMito"]) # https://wckdouglas.github.io/2016/12/seaborn_annoying_title g = sns.FacetGrid( res, col="variable", sharex=False, sharey=False, margin_titles=True, hue="variable", height=figsize[1], aspect=figsize[0] / figsize[1], ) if group is None: g.map_dataframe(sns.violinplot, x="variable", y="value") g.set_xticklabels([]) g.set(xticks=[]) else: if res["group"].dtype.name == "category": xticks = res["group"].cat.categories else: xticks = np.sort(res["group"].unique()) kws = dict(order=xticks) g.map_dataframe(sns.violinplot, x="group", y="value", **kws) g.set_xticklabels(rotation=-30) [plt.setp(ax.texts, text="") for ax in g.axes.flat] # remove the original texts # important to add this before setting titles g.set_titles(row_template="{row_name}", col_template="{col_name}") g.set_xlabels("") g.set_ylabels("") g.set(ylim=(0, None)) return save_show_ret("basic_stats", save_show_or_return, save_kwargs, g)
[docs]def show_fraction( adata: AnnData, genes: Optional[List[str]] = None, group: Optional[str] = None, figsize: Tuple[float, float] = (4, 3), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> Optional[sns.FacetGrid]: """Plot the fraction of each category of data used in the velocity estimation. Args: adata: an AnnData object. genes: the list of gene names from which the fraction will be calculated. Defaults to None. group: the column key of `adata.obs` to facet the data into subplots. If None, no faceting will be used. Defaults to None. figsize: the size of each panel in the figure. Defaults to (4, 3). save_show_or_return: whether to save, show, or return the plots. Could be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the {"path": None, "prefix": 'show_fraction', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Raises: ValueError: `genes` does not contain any genes from the adata object. ValueError: `adata` does not have proper splicing or labeling data. Returns: None would be returned by default. If `save_show_or_return` is set to 'return', the generated figure (seaborn.FacetGrid) would be returned. """ if genes is not None: genes = list(adata.var_names.intersection(genes)) if len(genes) == 0: raise ValueError("The gene list you provided doesn't much any genes from the adata object.") mode = None if pd.Series(["spliced", "unspliced"]).isin(adata.layers.keys()).all(): mode = "splicing" elif pd.Series(["new", "total"]).isin(adata.layers.keys()).all(): mode = "labelling" elif pd.Series(["uu", "ul", "su", "sl"]).isin(adata.layers.keys()).all(): mode = "full" if not (mode in ["labelling", "splicing", "full"]): raise ValueError("your data doesn't seem to have either splicing or labeling or both information") if mode == "labelling": new_mat, total_mat = ( (adata.layers["new"], adata.layers["total"]) if genes is None else ( adata[:, genes].layers["new"], adata[:, genes].layers["total"], ) ) new_cell_sum, tot_cell_sum = ( (np.sum(new_mat, 1), np.sum(total_mat, 1)) if not issparse(new_mat) else (new_mat.sum(1).A1, total_mat.sum(1).A1) ) new_frac_cell = new_cell_sum / tot_cell_sum old_frac_cell = 1 - new_frac_cell df = pd.DataFrame( {"new_frac_cell": new_frac_cell, "old_frac_cell": old_frac_cell}, index=adata.obs.index, ) if group is not None and group in adata.obs.keys(): df["group"] = adata.obs[group] res = df.melt(value_vars=["new_frac_cell", "old_frac_cell"], id_vars=["group"]) else: res = df.melt(value_vars=["new_frac_cell", "old_frac_cell"]) elif mode == "splicing": if "ambiguous" in adata.layers.keys(): ambiguous = adata.layers["ambiguous"] if genes is None else adata[:, genes].layers["ambiguous"] else: ambiguous = csr_matrix(np.array([[0]])) if issparse(adata.layers["unspliced"]) else np.array([[0]]) unspliced_mat, spliced_mat, ambiguous_mat = ( adata.layers["unspliced"] if genes is None else adata[:, genes].layers["unspliced"], adata.layers["spliced"] if genes is None else adata[:, genes].layers["spliced"], ambiguous, ) un_cell_sum, sp_cell_sum = ( (np.sum(unspliced_mat, 1), np.sum(spliced_mat, 1)) if not issparse(unspliced_mat) else (unspliced_mat.sum(1).A1, spliced_mat.sum(1).A1) ) if "ambiguous" in adata.layers.keys(): am_cell_sum = ambiguous_mat.sum(1).A1 if issparse(unspliced_mat) else np.sum(ambiguous_mat, 1) tot_cell_sum = un_cell_sum + sp_cell_sum + am_cell_sum un_frac_cell, sp_frac_cell, am_frac_cell = ( un_cell_sum / tot_cell_sum, sp_cell_sum / tot_cell_sum, am_cell_sum / tot_cell_sum, ) df = pd.DataFrame( { "unspliced": un_frac_cell, "spliced": sp_frac_cell, "ambiguous": am_frac_cell, }, index=adata.obs.index, ) else: tot_cell_sum = un_cell_sum + sp_cell_sum un_frac_cell, sp_frac_cell = ( un_cell_sum / tot_cell_sum, sp_cell_sum / tot_cell_sum, ) df = pd.DataFrame( {"unspliced": un_frac_cell, "spliced": sp_frac_cell}, index=adata.obs.index, ) if group is not None and group in adata.obs.columns: df["group"] = adata.obs.loc[:, group] res = ( df.melt( value_vars=["unspliced", "spliced", "ambiguous"], id_vars=["group"], ) if "ambiguous" in adata.layers.keys() else df.melt(value_vars=["unspliced", "spliced"], id_vars=["group"]) ) else: res = ( df.melt(value_vars=["unspliced", "spliced", "ambiguous"]) if "ambiguous" in adata.layers.keys() else df.melt(value_vars=["unspliced", "spliced"]) ) elif mode == "full": uu, ul, su, sl = ( adata.layers["uu"] if genes is None else adata[:, genes].layers["uu"], adata.layers["ul"] if genes is None else adata[:, genes].layers["ul"], adata.layers["su"] if genes is None else adata[:, genes].layers["su"], adata.layers["sl"] if genes is None else adata[:, genes].layers["sl"], ) uu_sum, ul_sum, su_sum, sl_sum = ( (np.sum(uu, 1), np.sum(ul, 1), np.sum(su, 1), np.sum(sl, 1)) if not issparse(uu) else ( uu.sum(1).A1, ul.sum(1).A1, su.sum(1).A1, sl.sum(1).A1, ) ) tot_cell_sum = uu_sum + ul_sum + su_sum + sl_sum uu_frac, ul_frac, su_frac, sl_frac = ( uu_sum / tot_cell_sum, ul_sum / tot_cell_sum, su_sum / tot_cell_sum, sl_sum / tot_cell_sum, ) df = pd.DataFrame( { "uu_frac": uu_frac, "ul_frac": ul_frac, "su_frac": su_frac, "sl_frac": sl_frac, }, index=adata.obs.index, ) if group is not None and group in adata.obs.keys(): df["group"] = adata.obs[group] res = df.melt( value_vars=["uu_frac", "ul_frac", "su_frac", "sl_frac"], id_vars=["group"], ) else: res = df.melt(value_vars=["uu_frac", "ul_frac", "su_frac", "sl_frac"]) g = sns.FacetGrid( res, col="variable", sharex=False, sharey=False, margin_titles=True, hue="variable", height=figsize[1], aspect=figsize[0] / figsize[1], ) if group is None: g.map_dataframe(sns.violinplot, x="variable", y="value") g.set_xticklabels([]) g.set(xticks=[]) else: if res["group"].dtype.name == "category": xticks = res["group"].cat.categories else: xticks = np.sort(res["group"].unique()) kws = dict(order=xticks) g.map_dataframe(sns.violinplot, x="group", y="value", **kws) g.set_xticklabels(rotation=-30) [plt.setp(ax.texts, text="") for ax in g.axes.flat] # remove the original texts # important to add this before setting titles g.set_titles(row_template="{row_name}", col_template="{col_name}") g.set_xlabels("") g.set_ylabels("Fraction") g.set(ylim=(0, None)) return save_show_ret("show_fraction", save_show_or_return, save_kwargs, g)
[docs]def variance_explained( adata: AnnData, threshold: float = 0.002, n_pcs: Optional[int] = None, figsize: Tuple[float, float] = (4, 3), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> Axes: """Plot the accumulative variance explained by the principal components. Args: adata: an AnnDate object. threshold: the threshold for the second derivative of the cumulative sum of the variance for each principal component. This threshold is used to determine the number of principle components used for downstream non- linear dimension reduction. Defaults to 0.002. n_pcs: the number of principal components. If None, the number of components would be inferred automatically. Defaults to None. figsize: the size of each panel of the figure. Defaults to (4, 3). save_show_or_return: whether to save, show, or return the generated figure. Can be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the {"path": None, "prefix": 'variance_explained', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Returns: None would be returned by default. If `save_show_or_return` is set to be 'return', the matplotlib Axes of the figure would be returned. """ var_ = adata.uns["explained_variance_ratio_"] _, ax = plt.subplots(figsize=figsize) ax.plot(var_, c="r") tmp = np.diff(np.diff(np.cumsum(var_)) > threshold) n_comps = n_pcs if n_pcs is not None else np.where(tmp)[0][0] if np.any(tmp) else 20 ax.axvline(n_comps, c="r") ax.set_xlabel("PCs") ax.set_ylabel("Variance explained") ax.set_xticks(list(ax.get_xticks()) + [n_comps]) ax.set_xlim(0, len(var_)) return save_show_ret("variance_explained", save_show_or_return, save_kwargs, ax)
[docs]def biplot( adata: AnnData, pca_components: Tuple[int, int] = [0, 1], pca_key: str = "X_pca", loading_key: str = "PCs", figsize: Tuple[float, float] = (6, 4), scale_pca_embedding: bool = False, draw_pca_embedding: bool = False, show_text: bool = False, save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ax: Optional[Axes] = None, ) -> Axes: """A biplot overlays a score plot and a loadings plot in a single graph. In such a plot, points are the projected observations; vectors are the projected variables. If the data are well- approximated by the first two principal components, a biplot enables you to visualize high-dimensional data by using a two-dimensional graph. See more at: https://blogs.sas.com/content/iml/2019/11/06/what-are-biplots.html In general, the score plot and the loadings plot will have different scales. Consequently, you need to rescale the vectors or observations (or both) when you overlay the score and loadings plots. There are four common choices of scaling. Each scaling emphasizes certain geometric relationships between pairs of observations (such as distances), between pairs of variables (such as angles), or between observations and variables. This article discusses the geometry behind two-dimensional biplots and shows how biplots enable you to understand relationships in multivariate data. Args: adata: an AnnData object that has pca and loading information prepared. pca_components: the index of the pca components in loading matrix. Defaults to [0, 1]. pca_key: the key to the pca embedding matrix in `adata.obsm`. Defaults to "X_pca". loading_key: the key to the pca loading matrix in either `adata.uns` or `adata.varm`. Defaults to "PCs". figsize: the size of each subplot. Defaults to (6, 4). scale_pca_embedding: whether to scale the pca embedding. Defaults to False. draw_pca_embedding: whether to draw the pca embedding. Defaults to False. show_text: whether to show the text labels on plot. Defaults to False. save_show_or_return: whether to save, show, or return the generated figure. Can be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the function will use the {"path": None, "prefix": 'variance_explained', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. ax: the axes object on which the graph would be plotted. If None, a new axis would be created. Defaults to None. Raises: ValueError: invalid `loading_key`. Returns: None would be returned by default. If `save_show_or_return` is set to be 'return', the matplotlib Axes of the figure would be returned. """ if loading_key in adata.uns.keys(): PCs = adata.uns[loading_key] elif loading_key in adata.varm.keys(): PCs = adata.varm[loading_key] else: raise ValueError(f"No PC matrix {loading_key} found in neither .uns nor .varm.") # rotation matrix xvector = PCs[:, pca_components[0]] yvector = PCs[:, pca_components[1]] # pca components xs = adata.obsm[pca_key][:, pca_components[0]] ys = adata.obsm[pca_key][:, pca_components[1]] # scale pca component if scale_pca_embedding: scalex = 1.0 / (xs.max() - xs.min()) scaley = 1.0 / (ys.max() - ys.min()) else: scalex, scaley = 1, 1 cells = adata.obs_names if ax is None: fig, ax = plt.subplots(1, 1, figsize=figsize) for i in range(len(xvector)): # arrows project features, e.g. genes, as vectors onto PC axes ax.arrow(0, 0, xvector[i] * max(xs), yvector[i] * max(ys), color="r", width=0.0005, head_width=0.0025) if show_text: for i in range(len(xvector)): ax.text(xvector[i] * max(xs) * 1.01, yvector[i] * max(ys) * 1.01, cells[i], color="r") ax.set_xlabel("PC" + str(pca_components[0])) ax.set_ylabel("PC" + str(pca_components[1])) if draw_pca_embedding: for i in range(len(xs)): # circles project cells ax.plot(xs[i] * scalex, ys[i] * scaley, "b", alpha=0.1) if show_text: for i in range(len(xs)): ax.text(xs[i] * scalex * 1.01, ys[i] * scaley * 1.01, list(adata.obs.cluster)[i], color="b", alpha=0.1) return save_show_ret("biplot", save_show_or_return, save_kwargs, ax)
[docs]def loading( adata: AnnData, n_pcs: int = 10, loading_key: str = "PCs", n_top_genes: int = 10, ncol: int = 5, figsize: Tuple[float] = (6, 4), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> List[List[Axes]]: """Plot the top absolute pca loading genes. Red text are positive loading genes while black negative loading genes. Args: adata: an AnnData object that has pca and loading information prepared. n_pcs: the number of pca components. Defaults to 10. loading_key: the key to the pca loading matrix. Defaults to "PCs". n_top_genes: the number of top genes with the highest absolute loading score. Defaults to 10. ncol: the number of columns of the subplots. Defaults to 5. figsize: the size of each panel of the figure. Defaults to (6, 4). save_show_or_return: whether to save, show, or return the generated figure. Can be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the function will use the {"path": None, "prefix": 'biplot', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Raises: ValueError: invalid `loading_key` Returns: None would be returned by default. If `save_show_or_return` is set to be 'return', the matplotlib Axes of the figure would be returned. """ if loading_key in adata.uns.keys(): PCs = adata.uns[loading_key] elif loading_key in adata.varm.keys(): PCs = adata.varm[loading_key] else: raise ValueError(f"No PC matrix {loading_key} found in neither .uns nor .varm.") if n_pcs is None: n_pcs = PCs.shape[1] x = np.arange(n_top_genes) cells = adata.obs_names nrow, ncol = int(n_pcs / ncol), min([ncol, n_pcs]) fig, axes = plt.subplots(nrow, ncol, figsize=(figsize[0] * ncol, figsize[1] * nrow)) for i in np.arange(n_pcs): cur_row, cur_col = int(i / ncol), i % ncol cur_pc = PCs[:, i] cur_sign = np.sign(cur_pc) cur_pc = np.abs(cur_pc) sort_ind, sort_val = np.argsort(cur_pc)[::-1], np.sort(cur_pc)[::-1] axes[cur_row, cur_col].scatter(x, sort_val[: len(x)]) for j in x: axes[cur_row, cur_col].text( x[j], sort_val[j] * 1.01, cells[sort_ind[j]], color="r" if cur_sign[sort_ind[j]] > 0 else "k" ) axes[cur_row, cur_col].set_title("PC " + str(i)) return save_show_ret("loading", save_show_or_return, save_kwargs, axes)
[docs]def feature_genes( adata: AnnData, layer: str = "X", mode: Optional[Literal["cv_dispersion", "fano_dispersion", "seurat_dispersion", "gini"]] = None, figsize: tuple = (4, 3), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> Optional[PathCollection]: """Plot selected feature genes on top of the mean vs. dispersion scatterplot. Args: adata: an AnnData object. layer: the data from a particular layer (include X) used for making the feature gene plot. Defaults to "X". mode: the method to select the feature genes (can be either one kind of `dispersion` or `gini`). Defaults to None. figsize: the size of each panel of the figure. Defaults to (4, 3). save_show_or_return: whether to save, show, or return the generated figure. Can be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the {"path": None, "prefix": 'feature_genes', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Raises: ValueError: vector machine regression result not available in the AnnData object. Returns: None would be returned by default. If `save_show_or_return` is set to be 'return', the `PathCollection` generated with `pyplot.scatter` would be returned. """ mode = adata.uns["feature_selection"] if mode is None else mode layer = DynamoAdataKeyManager.get_available_layer_keys(adata, layer, include_protein=False)[0] uns_store_key = None ordering_genes = adata.var["use_for_pca"] if "use_for_pca" in adata.var.columns else None if "_dispersion" not in mode and mode != "gini": raise ValueError("Invalid mode!.") plt.figure(figsize=figsize) if mode == "gini": mean_key = layer + "_m" variance_key = layer + "_gini" if variance_key not in adata.var.columns: raise ValueError( "Looks like you have not run gene selection yet, try run necessary preprocessing first.") mean = DynamoAdataKeyManager.select_layer_data(adata, layer).mean(0)[0] table = adata.var.loc[:, [variance_key]] table[mean_key] = mean.T else: if mode == "seurat_dispersion": mean_key = "pp_gene_mean" variance_key = "pp_gene_variance" else: prefix = "" if layer == "X" else layer + "_" mean_key = prefix + "log_m" variance_key = prefix + "log_cv" uns_store_key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR" if not np.all(pd.Series([mean_key, variance_key]).isin(adata.var.columns)): raise ValueError("Looks like you have not run gene selection yet, try run necessary preprocessing first.") else: table = adata.var.loc[:, [mean_key, variance_key]] table = table.loc[ np.isfinite(table[mean_key]) & np.isfinite(table[variance_key]) ] x_min, x_max = ( np.nanmin(table[mean_key]), np.nanmax(table[mean_key]), ) if mode == "cv_dispersion" or mode == "fano_dispersion": mu_linspace = np.linspace(x_min, x_max, num=1000) mean = adata.uns[uns_store_key]["mean"] cv = adata.uns[uns_store_key]["cv"] svr_gamma = adata.uns[uns_store_key]["svr_gamma"] fit, _ = get_prediction_by_svr(mean, cv, svr_gamma) fit = fit(mu_linspace.reshape(-1, 1)) plt.plot(mu_linspace, fit, alpha=0.4, color="r") valid_ind = ( table.index.isin(ordering_genes.index[ordering_genes]) if ordering_genes is not None else np.ones(table.shape[0], dtype=bool) ) valid_disp_table = table.iloc[valid_ind, :] plt.scatter( valid_disp_table[mean_key], valid_disp_table[variance_key], s=3, alpha=1, color="xkcd:red", ) neg_disp_table = table.iloc[~valid_ind, :] plt.scatter( neg_disp_table[mean_key], neg_disp_table[variance_key], s=3, alpha=0.5, color="xkcd:grey", ) if mode == "gini": plt.xlabel("Mean") plt.ylabel("Variance") else: plt.yscale("log") plt.xlabel("Mean (log)") plt.ylabel("CV (log)") return save_show_ret("feature_genes", save_show_or_return, save_kwargs, plt.gcf())
[docs]def exp_by_groups( adata: AnnData, genes: List[str], layer: Optional[str] = None, group: Optional[str] = None, use_ratio: bool = False, use_smoothed: bool = True, log: bool = True, angle: int = 0, re_order: bool = True, figsize: Tuple[float] = (4, 3), save_show_or_return: Literal["save", "show", "return"] = "show", save_kwargs: Dict[str, Any] = {}, ) -> Optional[sns.FacetGrid]: """Plot the (labeled) expression values of genes across different groups (time points). Args: adata: an AnnData object, genes: the list of genes that you want to plot the gene expression. layer: the layer key containing the expression data. If None, the layer used would be inferred automatically. Defaults to None. group: the key of group information in `adata.obs` that will be plotted against to. If None, no groups will be used. Normally you should supply the column that indicates the time related to the labeling experiment. For example, it can be either the labeling time for a kinetic experiment or the chase time for a degradation experiment. Defaults to None. use_ratio: whether to plot the fraction of expression (for example NTR, new to total ratio) over groups. Defaults to False. use_smoothed: whether to use the smoothed data as gene expression. Defaults to True. log: whether to log1p transform the expression data. Defaults to True. angle: the angle to rotate the xtick labels for the purpose of avoiding overlapping between text. Defaults to 0. re_order: whether to reorder categories before drawing groups on the x-axis. Defaults to True. figsize: the size of each panel of the figure. Defaults to (4, 3). save_show_or_return: whether to save, show, or return the generated figure. Can be one of 'save', 'show', or 'return'. Defaults to "show". save_kwargs: a dictionary that will be passed to the save_show_ret function. By default, it is an empty dictionary and the save_show_ret function will use the {"path": None, "prefix": 'exp_by_groups', "dpi": None, "ext": 'pdf', "transparent": True, "close": True, "verbose": True} as its parameters. Otherwise, you can provide a dictionary that properly modify those keys according to your needs. Defaults to {}. Raises: ValueError: invalid `genes`. ValueError: invalid `group`. ValueError: invalid `layer`. Returns: None would be returned by default. If `save_show_or_return` is set to be 'return', the `FacetGrid` with violin plot generated with seaborn would be returned. """ valid_genes = adata.var_names.intersection(genes) if len(valid_genes) == 0: raise ValueError("The adata object doesn't include any gene from the list you provided!") if group is not None and group not in adata.obs.keys(): raise ValueError(f"The group {group} is not existed in your adata object!") ( has_splicing, has_labeling, splicing_labeling, has_protein, ) = detect_experiment_datatype(adata) if (has_splicing + has_labeling) == 0: layer = "X" if layer is None else layer elif has_splicing and not has_labeling: layer = "X_spliced" if layer is None else layer elif not has_splicing and has_labeling: layer = "X_new" if layer is None else layer elif has_splicing and has_labeling: layer = "X_new" if layer is None else layer if use_smoothed: mapper = get_mapper() layer = mapper[layer] if layer != "X" and layer not in adata.layers.keys(): raise ValueError(f"The layer {layer} is not existed in your adata object!") exprs = adata[:, valid_genes].X if layer == "X" else adata[:, valid_genes].layers[layer] exprs = exprs.A if issparse(exprs) else exprs if use_ratio: ( has_splicing, has_labeling, splicing_labeling, has_protein, ) = detect_experiment_datatype(adata) if has_labeling: if layer.startswith("X_") or layer.startswith("M_"): tot = ( adata[:, valid_genes].layers[mapper["X_total"]] if use_smoothed else adata[:, valid_genes].layers["X_total"] ) tot = tot.A if issparse(tot) else tot exprs = exprs / tot else: exprs = exprs else: if layer.startswith("X_") or layer.startswith("M_"): tot = ( adata[:, valid_genes].layers[mapper["X_unspliced"]] + adata[:, valid_genes].layers[mapper["X_spliced"]] if use_smoothed else adata[:, valid_genes].layers["X_unspliced"] + adata[:, valid_genes].layers["X_spliced"] ) tot = tot.A if issparse(tot) else tot exprs = exprs / tot else: exprs = exprs df = ( pd.DataFrame(np.log1p(exprs), index=adata.obs_names, columns=valid_genes) if log else pd.DataFrame(np.log1p(exprs), index=adata.obs_names, columns=valid_genes) ) if group is not None and group in adata.obs.columns: df["group"] = adata.obs[group] res = df.melt(id_vars=["group"]) else: df["group"] = 1 res = df.melt(id_vars=["group"]) if res["group"].dtype.name == "category": xticks = res["group"].cat.categories.sort_values() if re_order else res["group"].cat.categories else: xticks = np.sort(res["group"].unique()) kws = dict(order=xticks) # https://wckdouglas.github.io/2016/12/seaborn_annoying_title g = sns.FacetGrid( res, row="variable", sharex=False, sharey=False, margin_titles=True, hue="variable", height=figsize[1], aspect=figsize[0] / figsize[1], ) g.map_dataframe(sns.violinplot, x="group", y="value", **kws) g.map_dataframe(sns.pointplot, x="group", y="value", color="k", **kws) if group is None: g.set_xticklabels([]) g.set(xticks=[]) else: g.set_xticklabels(rotation=angle) [plt.setp(ax.texts, text="") for ax in g.axes.flat] # remove the original texts # important to add this before setting titles g.set_titles(row_template="{row_name}", col_template="{col_name}") if log: g.set_ylabels("log(Expression + 1)") else: g.set_ylabels("Expression") g.set_xlabels("") g.set(ylim=(0, None)) return save_show_ret("exp_by_groups", save_show_or_return, save_kwargs, g)
[docs]def highest_frac_genes( adata: AnnData, n_top: int = 30, gene_prefix_list: Optional[List[str]] = None, gene_prefix_only: bool = True, show: bool = True, save_path: str = None, ax: Optional[Axes] = None, gene_annotations: Optional[List[str]] = None, gene_annotation_key: str = "use_for_pca", log: bool = False, store_key: str = "highest_frac_genes", orient: Literal["v", "h"] = "v", figsize: Optional[Tuple[float]] = None, layer: Optional[str] = None, title: Optional[str] = None, v_rotation: float = 35, **kwargs, ) -> Axes: """Plot the top genes. Args: adata: an AnnData object. n_top: the number of top genes to show. Defaults to 30. gene_prefix_list: a list of gene name prefix. Defaults to None. gene_prefix_only: whether to show prefix of genes only. It only takes effect if gene prefix is provided. Defaults to True. show: whether to show the plots. Defaults to True. save_path: the path to save the figure. Defaults to None. ax: the axis on which the graph will be plotted. If None, a new axis would be created. Defaults to None. gene_annotations: annotations for genes, or annotations for gene prefix subsets. Defaults to None. gene_annotation_key: gene annotations key in adata.var. Defaults to "use_for_pca". log: whether to use log scale. Defaults to False. store_key: the key for storing expression percent results. Defaults to "highest_frac_genes". orient: the orientation of the graph. Can be one of 'v' or 'h'. 'v' for genes in x-axis and 'h' for genes on y-axis. Defaults to "v". figsize: the size of each panel of the figure. Defaults to None. layer: layer on which the gene percents will be computed. Defaults to None. title: the title of the figure. Defaults to None. v_rotation: rotation of text sticks when the direction is vertica. Defaults to 35. Raises: ValueError: invalid AnnData object. NotImplementedError: invalid `orient`. Returns: The matplotlib Axes of the figure. """ if ax is None: length = n_top * 0.4 if figsize is None: if orient == "v": fig, ax = plt.subplots(figsize=(length, 5)) else: fig, ax = plt.subplots(figsize=(7, length)) else: fig, ax = plt.subplots(figsize=figsize) if log: ax.set_xscale("log") adata = gene_selection.highest_frac_genes( adata, store_key=store_key, n_top=n_top, layer=layer, gene_prefix_list=gene_prefix_list, gene_prefix_only=gene_prefix_only, ) if adata is None: # something wrong with user input or compute_top_genes_df raise ValueError("Invalid adata. ") top_genes_df, selected_indices = ( adata.uns[store_key]["top_genes_df"], adata.uns[store_key]["selected_indices"], ) # TODO use top genes_df dataframe; however this logic currently # does not fit subset logics and may fail tests. # main_info("Using prexisting top_genes_df in .uns.") # top_genes_df = adata.uns["top_genes_df"] # draw plots sns.boxplot( data=top_genes_df, orient=orient, ax=ax, fliersize=1, showmeans=True, **kwargs, ) if gene_annotations is None: if gene_annotation_key in adata.var: gene_annotations = adata.var[gene_annotation_key][selected_indices] else: main_warning( "%s not in adata.var, ignoring the gene annotation key when plotting" % gene_annotation_key, indent_level=2, ) if orient == "v": ax.set_xticklabels(ax.get_xticklabels(), rotation=v_rotation, ha="right") ax.set_xlabel("genes") ax.set_ylabel("fractions of total counts") if gene_annotations is not None: ax2 = ax.twiny() ax2.set_xlim(ax.get_ylim()) ax2.set_xticks(ax.get_yticks()) ax2.set_xticks(list(range(len(gene_annotations)))) ax2.set_xticklabels(gene_annotations, rotation=v_rotation, ha="left") ax2.set_xlabel(gene_annotation_key) elif orient == "h": ax.set_xlabel("fractions of total counts") ax.set_ylabel("genes") if gene_annotations is not None: ax2 = ax.twinx() ax2.set_ylim(ax.get_ylim()) ax2.set_yticks(ax.get_yticks()) ax2.set_yticks(list(range(len(gene_annotations)))) ax2.set_yticklabels(gene_annotations) ax2.set_ylabel(gene_annotation_key) else: raise NotImplementedError("Invalid orient option") if title is None: if layer is None: ax.set_title("Rank by gene expression fraction") else: ax.set_title("Rank by %s fraction" % layer) if show: plt.show() if save_path: s_kwargs = { "path": save_path, "prefix": "plot_highest_gene", "dpi": None, "ext": "pdf", "transparent": True, "close": True, "verbose": True, } save_fig(**s_kwargs) return ax
# if save_show_or_return == "save": # s_kwargs = { # "path": save_path, # "prefix": "plot_highest_gene", # "dpi": None, # "ext": "pdf", # "transparent": True, # "close": True, # "verbose": True, # } # s_kwargs.update(kwargs) # save_fig(save_path, **s_kargs) # elif save_show_or_return == "show": # plt.show() # else: # return ax