# Source code for dynamo.tools.dimension_reduction

from typing import List, Optional

import numpy as np
from anndata import AnnData

from ..dynamo_logger import LoggerManager
from ..utils import copy_adata
from .connectivity import generate_neighbor_keys, neighbors
from .utils import update_dict
from .utils_reduceDimension import prepare_dim_reduction, run_reduce_dim


def reduceDimension(
    adata: AnnData,
    X_data: Optional[np.ndarray] = None,
    genes: Optional[List[str]] = None,
    layer: Optional[str] = None,
    basis: Optional[str] = "pca",
    dims: Optional[List[int]] = None,
    n_pca_components: int = 30,
    n_components: int = 2,
    n_neighbors: int = 30,
    reduction_method: str = "umap",
    embedding_key: Optional[str] = None,
    neighbor_key: Optional[str] = None,
    enforce: bool = False,
    cores: int = 1,
    copy: bool = False,
    **kwargs,
) -> Optional[AnnData]:
    """Compute a low dimension reduction projection of an AnnData object first with PCA, followed by non-linear
    dimension reduction methods

    Args:
        adata: An AnnData object.
        X_data: The user supplied data that will be used for dimension reduction directly. When it is given, the
            data preparation step (`prepare_dim_reduction`) is skipped entirely. Defaults to None.
        genes: The list of genes that will be used to subset the data for dimension reduction and clustering. If `None`,
            all genes will be used. Defaults to None.
        layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is
            used. Defaults to None.
        basis: The space that will be used for clustering. Valid names includes, for example, `pca`, `umap`,
            `velocity_pca` (that is, you can use velocity for clustering), etc. Defaults to "pca".
        dims: The list of dimensions that will be selected for clustering. If `None`, all dimensions will be used.
            Defaults to None.
        n_pca_components: Number of input PCs (principal components) that will be used for further non-linear dimension
            reduction. If n_pca_components is larger than the existing #PC in adata.obsm['X_pca'] or input layer's
            corresponding pca space (layer_pca), pca will be rerun with n_pca_components PCs requested. Defaults to 30.
        n_components: The dimension of the space to embed into. Defaults to 2.
        n_neighbors: The number of nearest neighbors when constructing adjacency matrix. Defaults to 30.
        reduction_method: Non-linear dimension reduction method to further reduce dimension based on the top
            n_pca_components PCA components. Currently, PSL (probabilistic structure learning, a new dimension reduction
            by us), tSNE (fitsne instead of traditional tSNE used) or umap are supported. Defaults to "umap".
        embedding_key: The str in `.obsm` that will be used as the key to save the reduced embedding space. By default,
            it is None and the key is set to layer + "_" + reduction_method. If layer is None, it will be
            "X_" + reduction_method (e.g. "X_umap"). Defaults to None.
        neighbor_key: The str in .uns that will be used as the key to save the nearest neighbor graph. By default it is
            None and the key is produced by `generate_neighbor_keys` from the layer name (an empty prefix is used when
            layer is None). Defaults to None.
        enforce: Whether to re-perform dimension reduction even if there is reduced basis in the AnnData object.
            Defaults to False.
        cores: The number of cores used for calculation. Used only when tSNE reduction_method is used. Defaults to 1.
        copy: Whether to return a copy of the AnnData object or update the object in place. Defaults to False.
        kwargs: Other kwargs that are forwarded to `run_reduce_dim` (and are intended for the underlying reducer, e.g.
            umap.UMAP). For umap, min_dist is a noticeable kwargs that would significantly influence the reduction
            result.

    Returns:
        An updated AnnData object updated with reduced dimension data for data from different layers, returned if `copy`
        is true. Otherwise `adata` is updated in place and None is returned.
    """

    logger = LoggerManager.gen_logger("dynamo-dimension-reduction")
    logger.log_time()

    adata = copy_adata(adata) if copy else adata

    logger.info("retrieve data for non-linear dimension reduction...", indent_level=1)

    if X_data is None:
        X_data, n_components, basis = prepare_dim_reduction(
            adata,
            genes=genes,
            layer=layer,
            basis=basis,
            dims=dims,
            n_pca_components=n_pca_components,
            n_components=n_components,
        )

    # Resolve the storage keys first so the "already computed?" test below looks at the exact slot the embedding
    # would be saved under.
    if embedding_key is None:
        embedding_key = "X_" + reduction_method if layer is None else layer + "_" + reduction_method
    if neighbor_key is None:
        neighbor_result_prefix = "" if layer is None else layer
        # Only the last element of the returned triple is needed here.
        _, _, neighbor_key = generate_neighbor_keys(neighbor_result_prefix)

    # Checking `embedding_key` instead of slicing the first two characters off `basis` keeps the check correct for
    # layer prefixes longer than two characters, for user supplied `X_data` (where `basis` is left untouched) and
    # for `basis=None`.
    has_basis = embedding_key in adata.obsm_keys()

    if has_basis and not enforce:
        logger.warning(
            f"adata already have basis {reduction_method}. dimension reduction {reduction_method} will be skipped! \n"
            f"set enforce=True to re-performing dimension reduction."
        )

    if enforce or not has_basis:
        logger.info(f"[{reduction_method.upper()}] using {basis} with n_pca_components = {n_pca_components}", indent_level=1)
        adata = run_reduce_dim(
            adata,
            X_data,
            n_components,
            n_pca_components,
            reduction_method,
            embedding_key,
            n_neighbors,
            neighbor_key,
            cores,
            **kwargs,
        )

    # Make sure a neighbor graph entry exists even when the reduction step was skipped or did not store one.
    if neighbor_key not in adata.uns_keys():
        neighbors(adata)

    logger.finish_progress(progress_name=reduction_method.upper())

    return adata if copy else None


def run_umap(
    adata: AnnData,
    X_data: Optional[np.ndarray] = None,
    genes: Optional[List[str]] = None,
    layer: Optional[str] = None,
    basis: Optional[str] = "pca",
    dims: Optional[List[int]] = None,
    n_pca_components: int = 30,
    n_components: int = 2,
    n_neighbors: int = 30,
    embedding_key: Optional[str] = None,
    neighbor_key: Optional[str] = None,
    enforce: bool = False,
    cores: int = 1,
    copy: bool = False,
    min_dist: float = 0.5,
    **kwargs,
) -> Optional[AnnData]:
    """Compute a low dimension reduction projection of an AnnData object first with PCA, followed by UMAP.

    This is a thin wrapper around `reduceDimension` with `reduction_method` fixed to "umap" and the important
    `min_dist` value exposed as an explicit argument.

    Args:
        adata: An AnnData object.
        X_data: The user supplied data that will be used for dimension reduction directly. Defaults to None.
        genes: The list of genes that will be used to subset the data for dimension reduction and clustering. If `None`,
            all genes will be used. Defaults to None.
        layer: The layer that will be used to retrieve data for dimension reduction and clustering. If `None`, .X is
            used. Defaults to None.
        basis: The space that will be used for clustering. Valid names includes, for example, `pca`, `umap`,
            `velocity_pca` (that is, you can use velocity for clustering), etc. Defaults to "pca".
        dims: The list of dimensions that will be selected for clustering. If `None`, all dimensions will be used.
            Defaults to None.
        n_pca_components: Number of input PCs (principal components) that will be used for further non-linear dimension
            reduction. If n_pca_components is larger than the existing #PC in adata.obsm['X_pca'] or input layer's
            corresponding pca space (layer_pca), pca will be rerun with n_pca_components PCs requested. Defaults to 30.
        n_components: The dimension of the space to embed into. Defaults to 2.
        n_neighbors: The number of nearest neighbors when constructing adjacency matrix. Defaults to 30.
        embedding_key: The str in .obsm that will be used as the key to save the reduced embedding space. By default, it
            is None and the key is set to layer + "_umap". If layer is None, it will be "X_umap". Defaults to None.
        neighbor_key: The str in .uns that will be used as the key to save the nearest neighbor graph. By default, it is
            None and the key is derived from the layer name by `reduceDimension`. Defaults to None.
        enforce: Whether to re-perform dimension reduction even if there is reduced basis in the AnnData object.
            Defaults to False.
        cores: The number of cores used for calculation. Forwarded unchanged to `reduceDimension`, which documents it
            as used only by the tSNE reduction method. Defaults to 1.
        copy: Whether to return a copy of the AnnData object or update the object in place. Defaults to False.
        min_dist: The min_dist arg passed to umap.UMAP. Defaults to 0.5.
        kwargs: Other kwargs that will be forwarded, together with `min_dist`, to `reduceDimension`.

    Returns:
        An updated AnnData object updated with reduced dimension data for data from different layers, returned if `copy`
        is true.
    """
    # `min_dist` is a named parameter, so Python never places it in `kwargs`. Passing it as an explicit keyword
    # guarantees it reaches the reducer and cannot collide with anything in `kwargs`.
    return reduceDimension(
        adata=adata,
        X_data=X_data,
        genes=genes,
        layer=layer,
        basis=basis,
        dims=dims,
        n_pca_components=n_pca_components,
        n_components=n_components,
        n_neighbors=n_neighbors,
        reduction_method="umap",
        embedding_key=embedding_key,
        neighbor_key=neighbor_key,
        enforce=enforce,
        cores=cores,
        copy=copy,
        min_dist=min_dist,
        **kwargs,
    )


# @docstrings.with_indent(4)
# def run_umap(X,
#         n_neighbors=30,
#         n_components=2,
#         metric="euclidean",
#         min_dist=0.1,
#         spread=1.0,
#         n_epochs=None,
#         alpha=1.0,
#         gamma=1.0,
#         negative_sample_rate=5,
#         init_pos='spectral',
#         random_state=0,
#         verbose=False, **umap_kwargs):
#     """Perform umap analysis.
#
#     Parameters
#     ----------
#     %(umap_ann.parameters)s
#
#     Returns
#     -------
#         graph, knn_indices, knn_dists, embedding_, mapper
#             A tuple of kNN graph (`graph`), indices of nearest neighbors of each cell (knn_indices), distances of
#             nearest neighbors (knn_dists), the low dimensional embedding (embedding_) and finally the fit mapper
#             from umap, which can be used to transform new high dimensional data to low dimensional space or
#             perform inverse transform of new data back to high dimension.
#     """
#
#     _umap_kwargs={"angular_rp_forest": False,  "local_connectivity": 1.0, "metric_kwds": None,
#                  "set_op_mix_ratio": 1.0, "target_metric": 'categorical', "target_metric_kwds": None,
#                  "target_n_neighbors": -1, "target_weight": 0.5, "transform_queue_size": 4.0,
#                  "transform_seed": 42}
#     umap_kwargs = update_dict(_umap_kwargs, umap_kwargs)
#
#     mapper = umap.UMAP(n_neighbors=n_neighbors,
#                        n_components=n_components,
#                        metric=metric,
#                        min_dist=min_dist,
#                        spread=spread,
#                        n_epochs=n_epochs,
#                        learning_rate=alpha,
#                        repulsion_strength=gamma,
#                        negative_sample_rate=negative_sample_rate,
#                        init=init_pos,
#                        random_state = random_state,
#                        verbose=verbose,
#                        **umap_kwargs
#     ).fit(X)
#
#     dmat = pairwise_distances(X, metric=metric)
#     graph = fuzzy_simplicial_set(
#         X=dmat,
#         n_neighbors=n_neighbors,
#         random_state=random_state,
#         metric="precomputed",
#         verbose=verbose
#     )
#     # extract knn_indices, knn_dist
#     g_tmp = deepcopy(graph)
#     g_tmp[graph.nonzero()] = dmat[graph.nonzero()]
#     knn_indices, knn_dists = extract_indices_dist_from_graph(g_tmp, n_neighbors=n_neighbors)
#
#     knn_indices, knn_dists = extract_indices_dist_from_graph(mapper.graph_, n_neighbors=n_neighbors)
#
#     return mapper.graph_, knn_dists, knn_indices, mapper.transform(X), mapper