Source code for dynamo.preprocessing.dynast

from typing import Union

import anndata
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.utils import sparsefuncs

from ..dynamo_logger import LoggerManager, main_tqdm
from ..utils import copy_adata


[docs]def lambda_correction(
    adata: anndata.AnnData,
    lambda_key: str = "lambda",
    inplace: bool = True,
    copy: bool = False,
) -> Union[anndata.AnnData, None]:
    """Use lambda (cell-wise detection rate) to estimate the labelled RNA.

    Args:
        adata: An adata object generated from dynast.
        lambda_key: The key to the cell-wise detection rate. Defaults to "lambda".
        inplace: Whether to inplace update the layers. If False, new layers that append '_corrected" to the existing
            will be used to store the updated data. Defaults to True.
        copy: Whether to copy the adata object or update adata object inplace. Defaults to False.

    Raises:
        ValueError: The `lambda_key` cannot be found in `adata.obs`.
        ValueError: The adata object has to include labeling layers.
        ValueError: `data_type` is set to 'splicing_labeling' but the existing layers in the adata object don't meet the
            requirements.
        ValueError: `data_type` is set to 'labeling' but the existing layers in the adata object don't meet the
            requirements.

    Returns:
        A new AnnData object that are updated with lambda corrected layers if `copy` is true. Otherwise, return None.
    """

    logger = LoggerManager.gen_logger("dynamo-lambda_correction")
    logger.log_time()

    adata = copy_adata(adata) if copy else adata

    logger.info("apply detection rate correction to adata...", indent_level=1)

    if lambda_key not in adata.obs.keys():
        raise ValueError(
            f"the lambda_key {lambda_key} is not included in adata.obs! Please ensure you have calculated "
            "per-cell detection rate!"
        )

    logger.info("retrieving the cell-wise detection rate..", indent_level=1)
    detection_rate = adata.obs[lambda_key].values[:, None]

    logger.info("identify the data type..", indent_level=1)
    all_layers = adata.layers.keys()

    has_ul = np.any(["ul_" in i for i in all_layers])
    has_un = np.any(["un_" in i for i in all_layers])
    has_sl = np.any(["sl_" in i for i in all_layers])
    has_sn = np.any(["sn_" in i for i in all_layers])

    has_l = np.any(["_l_" in i for i in all_layers])
    has_n = np.any(["_n_" in i for i in all_layers])

    if np.count_nonzero([has_ul, has_un, has_sl, has_sn]) == 4:
        datatype = "splicing_labeling"
    elif np.count_nonzero([has_l, has_n]):
        datatype = "labeling"
    else:
        raise ValueError(
            "the adata object has to include labeling layers."
        )

    logger.info(f"the data type identified is {datatype}", indent_level=2)

    logger.info("retrieve relevant layers for detection rate correction", indent_level=1)
    if datatype == "splicing_labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "ul_" in layer:
                layers.append(layer)
                match_tot_layer.append("unspliced")
            elif "un_" in layer:
                layers.append(layer)
                match_tot_layer.append("unspliced")
            elif "sl_" in layer:
                layers.append(layer)
                match_tot_layer.append("spliced")
            elif "sn_" in layer:
                layers.append(layer)
                match_tot_layer.append("spliced")
            elif "spliced" in layer:
                layers.append(layer)
            elif "unspliced" in layer:
                layers.append(layer)

        if len(layers) != 6:
            raise ValueError(
                "the adata object has to include ul, un, sl, sn, unspliced, spliced, "
                "six relevant layers for splicing and labeling quantified datasets."
            )
    elif datatype == "labeling":
        layers, match_tot_layer = [], []
        for layer in all_layers:
            if "_l_" in layer:
                layers.append(layer)
                match_tot_layer.append("total")
            elif "_n_" in layer:
                layers.append(layer)
                match_tot_layer.append("total")
            elif "total" in layer:
                layers.append(layer)

        if len(layers) != 3:
            raise ValueError(
                "the adata object has to include labeled, unlabeled, three relevant layers for labeling quantified "
                "datasets."
            )

    logger.info("detection rate correction starts", indent_level=1)
    for i, layer in enumerate(main_tqdm(layers, desc="iterating all relevant layers")):
        if i < len(match_tot_layer):
            cur_layer = adata.layers[layer] if inplace else adata.layers[layer].copy()
            cur_total = adata.layers[match_tot_layer[i]]

            # even layers is labeled RNA and odd unlabeled RNA
            if i % 2 == 0:
                # formula: min(L / lambda, (L + U)) from scNT-seq
                if issparse(cur_layer):
                    sparsefuncs.inplace_row_scale(cur_layer, 1 / detection_rate)
                else:
                    cur_layer /= detection_rate
                if inplace:
                    adata.layers[layer] = sparse_mimmax(cur_layer, cur_total)
                else:
                    adata.layers[layer + "_corrected"] = sparse_mimmax(cur_layer, cur_total)

            else:
                if inplace:
                    adata.layers[layer] = cur_total - adata.layers[layers[i - 1]]
                else:
                    adata.layers[layer + "_corrected"] = cur_total - adata.layers[layers[i - 1]]

    logger.finish_progress(progress_name="lambda_correction")

    if copy:
        return adata
    return None


def sparse_mimmax(A: csr_matrix, B: csr_matrix, type="min") -> csr_matrix:
    """Return the element-wise minimum/maximum of sparse matrices `A` and `B`.

    Args:
        A: The first sparse matrix.
        B: The second sparse matrix.
        type: The type of calculation, either "min" or "max". Defaults to "min".

    Returns:
        A sparse matrix that contain the element-wise maximal or minimal of two sparse matrices.
    """

    AgtB = (A < B).astype(int) if type == "min" else (A > B).astype(int)
    M = np.multiply(AgtB, A - B) + B

    return M