Source code for dynamo.tools.velocyto_scvelo

# functions to run velocyto and scvelo
import numpy as np
import pandas as pd

# import velocyto as vcy
# import scvelo as scv
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from .moments import *
from anndata import AnnData


def vlm_to_adata(vlm, n_comps=30, basis="umap", trans_mats=None, cells_ixs=None):
    """ Conversion function from the velocyto world to the dynamo world.
		Code original from scSLAM-seq repository

    Parameters
    ----------
		vlm: VelocytoLoom Object
			The VelocytoLoom object that will be converted into adata.
		n_comps: `int` (default: 30)
			The number of pc components that will be stored.
		basis: `str` (default: `umap`)
			The embedding that will be used to store the vlm.ts attribute. Note that velocyto doesn't usually use
			umap as embedding although `umap` as set as default for the convenience of dynamo itself.
		trans_mats: None or dict
			A dict of all relevant transition matrices
		cell_ixs: list of int
			These are the indices of the subsampled cells

    Returns
    -------
		adata: AnnData object
	"""
    from collections import OrderedDict

    # set obs, var
    obs, var = pd.DataFrame(vlm.ca), pd.DataFrame(vlm.ra)
    if "CellID" in obs.keys():
        obs["obs_names"] = obs.pop("CellID")
    if "Gene" in var.keys():
        var["var_names"] = var.pop("Gene")

    if hasattr(vlm, "q"):
        var["gamma_b"] = vlm.q
    if hasattr(vlm, "gammas"):
        var["gamma"] = vlm.gammas
    if hasattr(vlm, "R2"):
        var["gamma_r2"] = vlm.R2

    # rename clusters to louvain
    try:
        ix = np.where(obs.columns == "Clusters")[0][0]
        obs_names = list(obs.columns)
        obs_names[ix] = "louvain"
        obs.columns = obs_names

        # make louvain a categorical field
        obs["louvain"] = pd.Categorical(obs["louvain"])
    except:
        print("Could not find a filed 'Clusters' in vlm.ca.")

    # set layers basics
    layers = OrderedDict(
        unspliced=csr_matrix(vlm.U.T),
        spliced=csr_matrix(vlm.S.T),
        velocity_S=csr_matrix(vlm.velocity.T),
    )

    # set X_spliced / X_unspliced
    if hasattr(vlm, "S_norm"):
        layers["X_spliced"] = csr_matrix(2**vlm.S_norm - 1).T
    if hasattr(vlm, "U_norm"):
        layers["X_unspliced"] = csr_matrix(2**vlm.U_norm - 1).T
    if hasattr(vlm, "S_sz") and not hasattr(vlm, "S_norm"):
        layers["X_spliced"] = csr_matrix(vlm.S_sz).T
    if hasattr(vlm, "U_sz") and hasattr(vlm, "U_norm"):
        layers["X_unspliced"] = csr_matrix(vlm.U_sz).T

    # set M_s / M_u
    if hasattr(vlm, "Sx"):
        layers["M_s"] = csr_matrix(vlm.Sx).T
    if hasattr(vlm, "Ux"):
        layers["M_u"] = csr_matrix(vlm.Ux).T
    if hasattr(vlm, "Sx_sz") and not hasattr(vlm, "Sx"):
        layers["M_s"] = csr_matrix(vlm.Sx_sz).T
    if hasattr(vlm, "Ux_sz") and hasattr(vlm, "Ux"):
        layers["M_u"] = csr_matrix(vlm.Ux_sz).T

    # set obsm
    obsm = {}
    obsm["X"] = vlm.pcs[:, : min(n_comps, vlm.pcs.shape[1])]
    # set basis and velocity on the basis
    if basis is not None:
        obsm["X_" + basis] = vlm.ts
        obsm["velocity_" + basis] = vlm.delta_embedding

    # set transition matrix:
    uns = {}
    if hasattr(vlm, "corrcoef"):
        uns["transition_matrix"] = vlm.corrcoef
    if hasattr(vlm, "colorandum"):
        uns["louvain_colors"] = list(np.unique(vlm.colorandum))

    # add uns annotations
    if trans_mats is not None:
        for key, value in trans_mats.items():
            uns[key] = trans_mats[key]
    if cells_ixs is not None:
        uns["cell_ixs"] = cells_ixs
    if hasattr(vlm, "embedding_knn"):
        from .connectivity import adj_to_knn

        n_neighbors = np.unique((vlm.embedding_knn > 0).sum(1)).min()
        ind_mat, dist_mat = adj_to_knn(
            vlm.emedding_knn, n_neighbors
        )
        uns["neighbors"] = {"indices": ind_mat}
        obsp = {'distances': dist_mat, "connectivities": vlm.emedding_knn}

    uns["dynamics"] = {
        "filter_gene_mode": None,
        "t": None,
        "group": None,
        "X_data": None,
        "X_fit_data": None,
        "asspt_mRNA": "ss",
        "experiment_type": "conventional",
        "normalized": True,
        "model": "deterministic",
        "has_splicing": True,
        "has_labeling": False,
        "has_protein": False,
        "use_smoothed": True,
        "NTR_vel": False,
        "log_unnormalized": True,
    }

    # set X
    if hasattr(vlm, "S_norm"):
        X = csr_matrix(vlm.S_norm.T)
    else:
        X = csr_matrix(vlm.S_sz.T) if hasattr(vlm, "S_sz") else csr_matrix(vlm.S.T)

    # create an anndata object with Dynamo characteristics.
    dyn_adata = AnnData(X=X, obs=obs, obsp=obsp, obsm=obsm, var=var, layers=layers, uns=uns)

    return dyn_adata


[docs]def converter(data_in, from_type="adata", to_type="vlm", dir="."):
    """
	convert adata to loom object
	- we may save_fig to a temp directory automatically
	- we may write a on-the-fly converter which doesn't involve saving and reading files  
	"""
    if from_type == "adata":
        if to_type == "vlm":
            file = dir + "/data.loom"
            data_in.write_loom(file)
            data_out = vcy.VelocytoLoom(file)
    elif from_type == "vlm":
        if to_type == "adata":
            data_out = vlm_to_adata(vlm)

    data_out.ra["Gene"] = data_out.ra["var_names"]  # required by plot_phase_portraits
    colors20 = np.vstack(
        (
            plt.cm.tab20b(np.linspace(0.0, 1, 20))[::2],
            plt.cm.tab20c(np.linspace(0, 1, 20))[1::2],
        )
    )

    def colormap_fun(x: np.ndarray) -> np.ndarray:
        return colors20[np.mod(x, 20)]

    data_out.colorandum = colormap_fun([1] * data_out.S.shape[1])

    return data_out


def run_velocyto(adata):
    """
	1. convert adata to vlm data
	2. set up PCA, UMAP, etc.
	3. estimate the gamma parameter
	"""
    vlm = converter(adata)

    # U_norm: log2(U_sz + pcount)
    # vlm.U_sz: norm_factor * U
    # S_norm: log2(S_sz + pcount)
    # vlm.S_sz norm_factor * S
    # vlm.Ux: smoothed unspliced
    # vlm.Sx: smoothed spliced
    # vlm.Ux_sz: smoothed unspliced -- old code
    # vlm.Sx_sz: smoothed spliced -- old code

    vlm.normalize()  # add U_norm, U_sz, S_norm, S_sz
    vlm.perform_PCA()
    vlm.knn_imputation()  # Ux, Sx, Ux_sz, Sx_sz
    vlm.pcs = adata.X  # pcs: cell x npcs ndarray

    # vlm.Sx = vlm.S_sz
    # vlm.Ux = vlm.U_sz
    # vlm.Sx_sz = vlm.S_sz
    # vlm.Ux_sz = vlm.U_sz

    # gamma fit
    vlm.fit_gammas()  # limit_gamma = False, fit_offset = True,  use_imputed_data = False, use_size_norm = False

    # estimate velocity
    vlm.predict_U()
    vlm.calculate_velocity()

    # predict future state after dt
    vlm.calculate_shift()  # assumption = 'constant_velocity'
    vlm.extrapolate_cell_at_t()  # delta_t = 1.

    return vlm


def run_scvelo(adata):
    """
	1. set up PCA, UMAP, etc. 
	2. estimate gamma and all other parameters 
	3. return results (adata.var['velocity_gamma'])
	"""
    # scv.pp.filter_and_normalize(adata, min_counts=2, min_counts_u=1, n_top_genes=3000)
    scv.pp.moments(adata)  # , n_pcs = 12, n_neighbors = 15, mode = 'distances'
    scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # how to fit other parameters, beta, etc.?

    return adata


def mean_var_by_time(X, Time):
    import pandas as pd

    exp_data = pd.DataFrame(X)
    exp_data["Time"] = Time

    mean_val = exp_data.groupby(["Time"]).mean()
    var_val = exp_data.groupby(["Time"]).var()

    return mean_val.values, var_val.values


def run_dynamo(adata, normalize=True, init_num=1, sample_method="lhs"):
    time = adata.obs["Step"].values
    uniqe_time = list(set(time))
    gene_num = adata.X.shape[1]

    # prepare data
    import numpy as np

    x_data = np.zeros((8, len(uniqe_time), gene_num))  # use unique time
    uu, ul, su, sl = (
        adata.layers["uu"].toarray(),
        adata.layers["ul"].toarray(),
        adata.layers["su"].toarray(),
        adata.layers["sl"].toarray(),
    )
    uu = np.log2(uu + 1) if normalize else uu
    ul = np.log2(ul + 1) if normalize else ul
    su = np.log2(su + 1) if normalize else su
    sl = np.log2(sl + 1) if normalize else sl

    x_data[0], x_data[4] = mean_var_by_time(uu, time)
    x_data[1], x_data[5] = mean_var_by_time(ul, time)
    x_data[2], x_data[6] = mean_var_by_time(su, time)
    x_data[3], x_data[7] = mean_var_by_time(sl, time)

    # estimation all parameters
    p0_range = {
        "a": [0, 1],
        "b": [0, 1],
        "la": [0, 1],
        "alpha_a": [10, 1000],
        "alpha_i": [0, 10],
        "sigma": [0, 1],
        "beta": [0, 10],
        "gamma": [0, 10],
    }

    estm = estimation(list(p0_range.values()))
    param_out = pd.DataFrame(
        index=adata.var.index,
        columns=["a", "b", "la", "alpha_a", "alpha_i", "sigma", "beta", "gamma"],
    )
    for i in range(gene_num):
        cur_x_data = x_data[:, :, i].squeeze()
        param_out.iloc[i, :], cost = estm.fit_lsq(
            uniqe_time, cur_x_data, p0=None, n_p0=init_num, sample_method=sample_method
        )

    # estimate only on the spliced and unspliced dataset

    # estimate on the labeled and unlabeled dataset

    # store the fitting result in adata.uns
    adata.uns.update({"dynamo": param_out})

    return adata


def run_dynamo_simple_fit(adata, log=True):
    ncells, gene_num = adata.X.shape

    # estimation all parameters
    param_out = pd.DataFrame(index=adata.var.index, columns=["alpha", "gamma"])

    u, s = adata.layers["unspliced"], adata.layers["spliced"]
    velocity_u, velocity_s = u, s
    for i in range(gene_num):
        cur_u, cur_s = u[:, i], s[:, i]
        gamma = fit_gamma(cur_u.toarray().squeeze(), cur_s.toarray().squeeze())
        alpha = np.mean(cur_s)

        velocity_u[:, i] = cur_u - cur_s * gamma
        velocity_s[:, i] = cur_s / (1 - np.exp(-1)) - cur_u
        param_out.iloc[i, :] = [alpha, gamma]

    adata.layers["velocity_u"] = velocity_u
    adata.layers["velocity_s"] = velocity_s
    adata.uns.update({"dynamo_simple_fit": param_out})

    return adata


def run_dynamo_labelling(adata, log=True, group=False):
    ncells, gene_num = adata.X.shape

    # estimation all parameters
    T = adata.obs["Time"]

    groups = [""] if group == False else np.unique(adata.obs[group])

    param_out = pd.DataFrame(
        index=adata.var.index,
        columns=[i + "_" + j for j in groups for i in ["alpha", "gamma", "u0", "l0"]],
    )
    L, U = adata.layers["L"], adata.layers["U"]
    velocity_u, velocity_s = L, U

    for i in range(gene_num):
        all_parm = []
        for cur_grp in groups.tolist():
            cur_L, cur_U = (
                (L[:, i], U[:, i])
                if cur_grp == ""
                else (
                    L[adata.obs[group] == cur_grp, i],
                    U[adata.obs[group] == cur_grp, i],
                )
            )
            if log:
                cur_U, cur_L = (
                    np.log(cur_U.toarray().squeeze() + 1),
                    np.log(cur_L.toarray().squeeze() + 1),
                )
            else:
                cur_U, cur_L = cur_U.toarray().squeeze(), cur_L.toarray().squeeze()

            gamma, l0 = fit_gamma_labelling(T, cur_L, mode=None)
            alpha, u0 = fit_alpha_labelling(T, cur_U, gamma, mode=None)
            tmp = [alpha, gamma, u0, l0]
            all_parm.extend(tmp)

            velocity_u[:, i] = (cur_L - cur_U * gamma)[:, None]
            velocity_s[:, i] = (cur_U / (1 - np.exp(-1)) - cur_L)[:, None]
            adata.layers[cur_grp + "velocity_u"] = velocity_u
            adata.layers[cur_grp + "velocity_s"] = velocity_s

        param_out.iloc[i, :] = all_parm

    adata.uns.update({"dynamo_labelling": param_out})

    return adata


def compare_res(
    adata,
    velocyto_res,
    svelo_res,
    dynamo_res,
    a_val,
    b_val,
    la_val,
    alpha_a_val,
    alpha_i_val,
    sigma_val,
    beta_val,
    gamma_val,
):
    """
	function to compare results from velocyto and scvelo with our new method
	0. retrieve gamm or gamma with other parameters from velocyto result or scvelo
	1. plot the correlation between parameters estimated with different methods
	2. calculate the correltion between those parameters
	"""
    # self._offset, self._offset2, self._beta, self._gamma, self._r2, self._velocity_genes

    velocyto_gammas = velocyto_res.gammas
    scvelo_gammas = svelo_res.var["velocity_gamma"]

    # scatter plot the true gammas with our result
    plt.subplots(figsize=(15, 5))
    plt.plot()
    plt.subplot(131)
    plt.plot(gamma_val, velocyto_gammas, "o")
    plt.xlabel(r"True $\gamma$")
    plt.ylabel(r"$\gamma$ (velocyto)")
    plt.subplot(132)
    plt.plot(gamma_val, scvelo_gammas, "o")
    plt.xlabel(r"True $\gamma$")
    plt.ylabel(r"$\gamma$ (scvelo)")
    plt.subplot(133)
    plt.plot(gamma_val, dynamo_res.uns["dynamo"]["gamma"], "o")
    plt.xlabel(r"True $\gamma$")
    plt.ylabel(r"$\gamma$ (dynamo)")

    # what if we only have a small number of parameters?
    plt.subplots(figsize=(15, 5))
    plt.plot()
    plt.subplot(131)
    plt.plot(alpha_a_val, svelo_res.var["fit_alpha"], "o")
    plt.xlabel(r"True alpha")
    plt.ylabel(r"$\alpha$ (scvelo)")
    plt.subplot(132)
    plt.plot(beta_val, svelo_res.var["fit_beta"], "o")
    plt.xlabel(r"True $\beta$")
    plt.ylabel(r"$\beta$ (scvelo)")
    plt.subplot(133)
    plt.plot(gamma_val, svelo_res.var["fit_gamma"], "o")
    plt.xlabel(r"True $\gamma$")
    plt.ylabel(r"$\gamma$ (scvelo)")

    #     param_out = pd.DataFrame(index=adata.var.index, columns=['a', 'b', 'la', 'alpha_a', 'alpha_i', 'sigma', 'beta', 'gamma'])
    # what if we only have a small number of parameters?
    plt.subplots(figsize=(15, 15))
    plt.subplot(331)
    plt.plot(a_val, adata.uns["dynamo"]["a"], "o")
    plt.xlabel(r"True $a$")
    plt.ylabel(r"$a$ (dynamo)")
    plt.subplot(332)
    plt.plot(b_val, adata.uns["dynamo"]["b"], "o")
    plt.xlabel(r"True $b$")
    plt.ylabel(r"$b$ (dynamo)")
    plt.subplot(333)
    plt.plot(la_val, adata.uns["dynamo"]["la"], "o")
    plt.xlabel(r"True $l_a$")
    plt.ylabel(r"$l_a$ (dynamo)")
    plt.subplot(334)
    plt.plot(alpha_a_val, adata.uns["dynamo"]["alpha_a"], "o")
    plt.xlabel(r"True $\alpha_a$")
    plt.ylabel(r"$\alpha_a$ (dynamo)")
    plt.subplot(335)
    plt.plot(alpha_i_val, adata.uns["dynamo"]["alpha_i"], "o")
    plt.xlabel(r"True $\alpha_i$")
    plt.ylabel(r"$\alpha_i$ (dynamo)")
    plt.subplot(336)
    plt.plot(sigma_val, adata.uns["dynamo"]["sigma"], "o")
    plt.xlabel(r"True $\sigma$")
    plt.ylabel(r"$\sigma$ (dynamo)")
    plt.subplot(337)
    plt.plot(beta_val, adata.uns["dynamo"]["beta"], "o")
    plt.xlabel(r"True $\beta$")
    plt.ylabel(r"$\beta$ (dynamo)")
    plt.subplot(338)
    plt.plot(gamma_val, adata.uns["dynamo"]["gamma"], "o")
    plt.xlabel(r"True $\gamma$")
    plt.ylabel(r"$\gamma$ (dynamo)")

    velocyto_coef = {"gamma": np.corrcoef(gamma_val, velocyto_gammas)[1, 0]}
    scvelo_coef = {
        "alpha": np.corrcoef(alpha_a_val, svelo_res.var["fit_alpha"])[1, 0],
        "beta": np.corrcoef(beta_val, svelo_res.var["fit_beta"])[1, 0],
        "gamma": np.corrcoef(gamma_val, svelo_res.var["fit_gamma"])[1, 0],
    }

    dynamo_coef = {
        "a": np.corrcoef(a_val, list(dynamo_res.uns["dynamo"]["a"]))[1, 0],
        "b": np.corrcoef(b_val, list(dynamo_res.uns["dynamo"]["b"]))[1, 0],
        "la": np.corrcoef(la_val, list(dynamo_res.uns["dynamo"]["la"]))[1, 0],
        "alpha_a": np.corrcoef(alpha_a_val, list(dynamo_res.uns["dynamo"]["alpha_a"]))[
            1, 0
        ],
        "alpha_i": np.corrcoef(alpha_i_val, list(dynamo_res.uns["dynamo"]["alpha_i"]))[
            1, 0
        ],
        "sigma": np.corrcoef(sigma_val, list(dynamo_res.uns["dynamo"]["sigma"]))[1, 0],
        "beta": np.corrcoef(beta_val, list(dynamo_res.uns["dynamo"]["beta"]))[1, 0],
        "gamma": np.corrcoef(gamma_val, list(dynamo_res.uns["dynamo"]["gamma"]))[1, 0],
    }

    return {
        "velocyto": pd.DataFrame.from_dict(velocyto_coef, orient="index").T,
        "scvelo": pd.DataFrame.from_dict(scvelo_coef, orient="index").T,
        "dynamo": pd.DataFrame.from_dict(dynamo_coef, orient="index").T,
    }