from typing import List, Optional
import numpy as np
from anndata import AnnData
from ..dynamo_logger import LoggerManager
from ..utils import copy_adata
from .connectivity import _gen_neighbor_keys, neighbors
from .utils import update_dict
from .utils_reduceDimension import prepare_dim_reduction, run_reduce_dim
def reduceDimension(
    adata: AnnData,
    X_data: Optional[np.ndarray] = None,
    genes: Optional[List[str]] = None,
    layer: Optional[str] = None,
    basis: Optional[str] = "pca",
    dims: Optional[List[int]] = None,
    n_pca_components: int = 30,
    n_components: int = 2,
    n_neighbors: int = 30,
    reduction_method: str = "umap",
    embedding_key: Optional[str] = None,
    neighbor_key: Optional[str] = None,
    enforce: bool = False,
    cores: int = 1,
    copy: bool = False,
    **kwargs,
) -> Optional[AnnData]:
    """Compute a low dimension reduction projection of an AnnData object first with PCA, followed by non-linear
    dimension reduction methods.

    Args:
        adata: an AnnData object.
        X_data: the user supplied data that will be used for dimension reduction directly. When given,
            `prepare_dim_reduction` is skipped and `n_components` / `basis` are used exactly as passed. Defaults to
            None.
        genes: the list of genes that will be used to subset the data for dimension reduction and clustering. If
            `None`, all genes will be used. Defaults to None.
        layer: the layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is
            used. Defaults to None.
        basis: the space that will be used for clustering. Valid names includes, for example, `pca`, `umap`,
            `velocity_pca` (that is, you can use velocity for clustering), etc. Defaults to "pca".
        dims: the list of dimensions that will be selected for clustering. If `None`, all dimensions will be used.
            Defaults to None.
        n_pca_components: Number of input PCs (principal components) that will be used for further non-linear
            dimension reduction. If n_pca_components is larger than the existing #PC in adata.obsm['X_pca'] or input
            layer's corresponding pca space (layer_pca), pca will be rerun with n_pca_components PCs requested.
            Defaults to 30.
        n_components: the dimension of the space to embed into. Defaults to 2.
        n_neighbors: the number of nearest neighbors when constructing adjacency matrix. Defaults to 30.
        reduction_method: Non-linear dimension reduction method to further reduce dimension based on the top
            n_pca_components PCA components. Currently, PSL (probabilistic structure learning, a new dimension
            reduction by us), tSNE (fitsne instead of traditional tSNE used) or umap are supported. Defaults to
            "umap".
        embedding_key: The str in .obsm that will be used as the key to save the reduced embedding space. By default
            it is None and the key is set to `layer + "_" + reduction_method`. If layer is None, it will be
            `"X_" + reduction_method` (e.g. "X_umap"). Defaults to None.
        neighbor_key: The str in .uns that will be used as the key to save the nearest neighbor graph. By default it
            is None and the key is generated by `_gen_neighbor_keys` from the layer name (an empty prefix is used
            when layer is None). Defaults to None.
        enforce: whether to re-perform dimension reduction even if there is reduced basis in the AnnData object.
            Defaults to False.
        cores: the number of cores used for calculation. Used only when tSNE reduction_method is used. Defaults to 1.
        copy: whether to return a copy of the AnnData object or update the object in place. Defaults to False.
        kwargs: other kwargs that are forwarded to `run_reduce_dim` (for umap these are expected to end up in
            umap.UMAP, where min_dist is a noticeable kwarg that would significantly influence the reduction result).

    Returns:
        An updated AnnData object updated with reduced dimension data for data from different layers, returned if
        `copy` is true. Otherwise the input object is updated in place and None is returned.
    """

    logger = LoggerManager.gen_logger("dynamo-dimension-reduction")
    logger.log_time()
    adata = copy_adata(adata) if copy else adata

    logger.info("retrieve data for non-linear dimension reduction...", indent_level=1)

    if X_data is None:
        X_data, n_components, basis = prepare_dim_reduction(
            adata,
            genes=genes,
            layer=layer,
            basis=basis,
            dims=dims,
            n_pca_components=n_pca_components,
            n_components=n_components,
        )

    # `basis` is Optional: slicing None would raise TypeError, so treat a missing basis as "no existing
    # embedding" and let the reduction run.
    has_basis = basis is not None and (basis[:2] + reduction_method) in adata.obsm_keys()

    if has_basis and not enforce:
        logger.warning(
            f"adata already have basis {reduction_method}. dimension reduction {reduction_method} will be skipped! \n"
            f"set enforce=True to re-performing dimension reduction."
        )

    if embedding_key is None:
        embedding_key = "X_" + reduction_method if layer is None else layer + "_" + reduction_method
    if neighbor_key is None:
        neighbor_result_prefix = "" if layer is None else layer
        # Only the neighbor key is needed here; the connectivity / distance keys are discarded.
        _, _, neighbor_key = _gen_neighbor_keys(neighbor_result_prefix)

    if enforce or not has_basis:
        logger.info(
            f"[{reduction_method.upper()}] using {basis} with n_pca_components = {n_pca_components}",
            indent_level=1,
        )
        adata = run_reduce_dim(
            adata,
            X_data,
            n_components,
            n_pca_components,
            reduction_method,
            embedding_key,
            n_neighbors,
            neighbor_key,
            cores,
            **kwargs,
        )

    # NOTE(review): this fallback calls `neighbors` with its own defaults; `n_neighbors` and `layer` are not
    # forwarded. Confirm that is intended.
    if neighbor_key not in adata.uns_keys():
        neighbors(adata)

    logger.finish_progress(progress_name=reduction_method.upper())

    if copy:
        return adata
    return None
def run_umap(
    adata: AnnData,
    X_data: Optional[np.ndarray] = None,
    genes: Optional[List[str]] = None,
    layer: Optional[str] = None,
    basis: Optional[str] = "pca",
    dims: Optional[List[int]] = None,
    n_pca_components: int = 30,
    n_components: int = 2,
    n_neighbors: int = 30,
    embedding_key: Optional[str] = None,
    neighbor_key: Optional[str] = None,
    enforce: bool = False,
    cores: int = 1,
    copy: bool = False,
    min_dist: float = 0.5,
    **kwargs,
) -> Optional[AnnData]:
    """Compute a low dimension reduction projection of an AnnData object first with PCA, followed by UMAP.

    This is a wrapper around `reduceDimension` with `reduction_method` fixed to "umap" and the important `min_dist`
    value exposed as an explicit argument.

    Args:
        adata: an AnnData object.
        X_data: the user supplied data that will be used for dimension reduction directly. Defaults to None.
        genes: the list of genes that will be used to subset the data for dimension reduction and clustering. If
            `None`, all genes will be used. Defaults to None.
        layer: the layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is
            used. Defaults to None.
        basis: the space that will be used for clustering. Valid names includes, for example, `pca`, `umap`,
            `velocity_pca` (that is, you can use velocity for clustering), etc. Defaults to "pca".
        dims: the list of dimensions that will be selected for clustering. If `None`, all dimensions will be used.
            Defaults to None.
        n_pca_components: Number of input PCs (principal components) that will be used for further non-linear
            dimension reduction. If n_pca_components is larger than the existing #PC in adata.obsm['X_pca'] or input
            layer's corresponding pca space (layer_pca), pca will be rerun with n_pca_components PCs requested.
            Defaults to 30.
        n_components: the dimension of the space to embed into. Defaults to 2.
        n_neighbors: the number of nearest neighbors when constructing adjacency matrix. Defaults to 30.
        embedding_key: The str in .obsm that will be used as the key to save the reduced embedding space. By default
            it is None, in which case `reduceDimension` uses `layer + "_umap"`, or "X_umap" if layer is None.
            Defaults to None.
        neighbor_key: The str in .uns that will be used as the key to save the nearest neighbor graph. By default it
            is None, in which case `reduceDimension` generates it from the layer name. Defaults to None.
        enforce: whether to re-perform dimension reduction even if there is reduced basis in the AnnData object.
            Defaults to False.
        cores: the number of cores used for calculation. Forwarded to `reduceDimension`, which documents it as used
            only for tSNE, so it presumably has no effect here. Defaults to 1.
        copy: whether to return a copy of the AnnData object or update the object in place. Defaults to False.
        min_dist: the min_dist arg passed to umap.UMAP. Defaults to 0.5.
        kwargs: other kwargs forwarded to `reduceDimension` (and from there expected to reach umap.UMAP).

    Returns:
        An updated AnnData object updated with reduced dimension data for data from different layers, returned if
        `copy` is true; otherwise None.
    """

    # Pass `min_dist` explicitly instead of merging it into `kwargs` through a helper: this guarantees the value
    # reaches `reduceDimension` no matter how that helper treats keys absent from its first argument. Because
    # `min_dist` is a named parameter it can never also be present in `kwargs`, so there is no duplicate-keyword risk.
    return reduceDimension(
        adata=adata,
        X_data=X_data,
        genes=genes,
        layer=layer,
        basis=basis,
        dims=dims,
        n_pca_components=n_pca_components,
        n_components=n_components,
        n_neighbors=n_neighbors,
        reduction_method="umap",
        embedding_key=embedding_key,
        neighbor_key=neighbor_key,
        enforce=enforce,
        cores=cores,
        copy=copy,
        min_dist=min_dist,
        **kwargs,
    )
# @docstrings.with_indent(4)
# def run_umap(X,
# n_neighbors=30,
# n_components=2,
# metric="euclidean",
# min_dist=0.1,
# spread=1.0,
# n_epochs=None,
# alpha=1.0,
# gamma=1.0,
# negative_sample_rate=5,
# init_pos='spectral',
# random_state=0,
# verbose=False, **umap_kwargs):
# """Perform umap analysis.
#
# Parameters
# ----------
# %(umap_ann.parameters)s
#
# Returns
# -------
# graph, knn_indices, knn_dists, embedding_, mapper
# A tuple of kNN graph (`graph`), indices of nearest neighbors of each cell (knn_indices), distances of
# nearest
# neighbors (knn_dists), the low dimensional embedding (embedding_) and finally the fit mapper from umap
# which
# can be used to transform new high dimensional data to low dimensional space or perform inverse transform
# of
# new data back to high dimension.
# """
#
# _umap_kwargs={"angular_rp_forest": False, "local_connectivity": 1.0, "metric_kwds": None,
# "set_op_mix_ratio": 1.0, "target_metric": 'categorical', "target_metric_kwds": None,
# "target_n_neighbors": -1, "target_weight": 0.5, "transform_queue_size": 4.0,
# "transform_seed": 42}
# umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)
#
# mapper = umap.UMAP(n_neighbors=n_neighbors,
# n_components=n_components,
# metric=metric,
# min_dist=min_dist,
# spread=spread,
# n_epochs=n_epochs,
# learning_rate=alpha,
# repulsion_strength=gamma,
# negative_sample_rate=negative_sample_rate,
# init=init_pos,
# random_state = random_state,
# verbose=verbose,
# **umap_kwargs
# ).fit(X)
#
# dmat = pairwise_distances(X, metric=metric)
# graph = fuzzy_simplicial_set(
# X=dmat,
# n_neighbors=n_neighbors,
# random_state=random_state,
# metric="precomputed",
# verbose=verbose
# )
# # extract knn_indices, knn_dist
# g_tmp = deepcopy(graph)
# g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
# knn_indices, knn_dists = extract_indices_dist_from_graph(g_tmp, n_neighbors=n_neighbors)
#
# knn_indices, knn_dists = extract_indices_dist_from_graph(mapper.graph_, n_neighbors=n_neighbors)
#
# return mapper.graph_, knn_dists, knn_indices, mapper.transform(X), mapper