Source code for dynamo.tools.velocyto_scvelo

# functions to run velocyto and scvelo
import numpy as np
import pandas as pd

# import velocyto as vcy
# import scvelo as scv
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from .moments import *
import anndata


def vlm_to_adata(vlm, n_comps=30, basis="umap", trans_mats=None, cells_ixs=None):
    """Conversion function from the velocyto world to the dynamo world.
    Code original from scSLAM-seq repository

    Parameters
    ----------
        vlm: VelocytoLoom Object
            The VelocytoLoom object that will be converted into adata.
        n_comps: `int` (default: 30)
            The number of pc components that will be stored.
        basis: `str` (default: `umap`)
            The embedding that will be used to store the vlm.ts attribute. Note that velocyto doesn't usually use
            umap as embedding although `umap` as set as default for the convenience of dynamo itself.
        trans_mats: None or dict
            A dict of all relevant transition matrices
        cell_ixs: list of int
            These are the indices of the subsampled cells

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            AnnData object
    """

    from collections import OrderedDict

    # set obs, var
    obs, var = pd.DataFrame(vlm.ca), pd.DataFrame(vlm.ra)
    if "CellID" in obs.keys():
        obs["obs_names"] = obs.pop("CellID")
    if "Gene" in var.keys():
        var["var_names"] = var.pop("Gene")

    if hasattr(vlm, "q"):
        var["gamma_b"] = vlm.q
    if hasattr(vlm, "gammas"):
        var["gamma"] = vlm.gammas
    if hasattr(vlm, "R2"):
        var["gamma_r2"] = vlm.R2

    # rename clusters to louvain
    try:
        ix = np.where(obs.columns == "Clusters")[0][0]
        obs_names = list(obs.columns)
        obs_names[ix] = "louvain"
        obs.columns = obs_names

        # make louvain a categorical field
        obs["louvain"] = pd.Categorical(obs["louvain"])
    except:
        print("Could not find a filed 'Clusters' in vlm.ca.")

    # set layers basics
    layers = OrderedDict(
        unspliced=csr_matrix(vlm.U.T),
        spliced=csr_matrix(vlm.S.T),
        velocity_S=csr_matrix(vlm.velocity.T),
    )

    # set X_spliced / X_unspliced
    if hasattr(vlm, "S_norm"):
        layers["X_spliced"] = csr_matrix(2 ** vlm.S_norm - 1).T
    if hasattr(vlm, "U_norm"):
        layers["X_unspliced"] = csr_matrix(2 ** vlm.U_norm - 1).T
    if hasattr(vlm, "S_sz") and not hasattr(vlm, "S_norm"):
        layers["X_spliced"] = csr_matrix(vlm.S_sz).T
    if hasattr(vlm, "U_sz") and hasattr(vlm, "U_norm"):
        layers["X_unspliced"] = csr_matrix(vlm.U_sz).T

    # set M_s / M_u
    if hasattr(vlm, "Sx"):
        layers["M_s"] = csr_matrix(vlm.Sx).T
    if hasattr(vlm, "Ux"):
        layers["M_u"] = csr_matrix(vlm.Ux).T
    if hasattr(vlm, "Sx_sz") and not hasattr(vlm, "Sx"):
        layers["M_s"] = csr_matrix(vlm.Sx_sz).T
    if hasattr(vlm, "Ux_sz") and hasattr(vlm, "Ux"):
        layers["M_u"] = csr_matrix(vlm.Ux_sz).T

    # set obsm
    obsm = {}
    obsm["X"] = vlm.pcs[:, : min(n_comps, vlm.pcs.shape[1])]
    # set basis and velocity on the basis
    if basis is not None:
        obsm["X_" + basis] = vlm.ts
        obsm["velocity_" + basis] = vlm.delta_embedding

    # set transition matrix:
    uns = {}
    if hasattr(vlm, "corrcoef"):
        uns["transition_matrix"] = vlm.corrcoef
    if hasattr(vlm, "colorandum"):
        uns["louvain_colors"] = list(np.unique(vlm.colorandum))

    # add uns annotations
    if trans_mats is not None:
        for key, value in trans_mats.items():
            uns[key] = trans_mats[key]
    if cells_ixs is not None:
        uns["cell_ixs"] = cells_ixs
    if hasattr(vlm, "embedding_knn"):
        from .connectivity import adj_to_knn

        n_neighbors = np.unique((vlm.embedding_knn > 0).sum(1)).min()
        ind_mat, dist_mat = adj_to_knn(vlm.emedding_knn, n_neighbors)
        uns["neighbors"] = {"indices": ind_mat}
        obsp = {"distances": dist_mat, "connectivities": vlm.emedding_knn}

    uns["dynamics"] = {
        "filter_gene_mode": None,
        "t": None,
        "group": None,
        "X_data": None,
        "X_fit_data": None,
        "asspt_mRNA": "ss",
        "experiment_type": "conventional",
        "normalized": True,
        "model": "deterministic",
        "est_method": "ols",
        "has_splicing": True,
        "has_labeling": False,
        "has_protein": False,
        "use_smoothed": True,
        "NTR_vel": False,
        "log_unnormalized": True,
        "fraction_for_deg": False,
    }

    # set X
    if hasattr(vlm, "S_norm"):
        X = csr_matrix(vlm.S_norm.T)
    else:
        X = csr_matrix(vlm.S_sz.T) if hasattr(vlm, "S_sz") else csr_matrix(vlm.S.T)

    # create an anndata object with Dynamo characteristics.
    dyn_adata = anndata.AnnData(X=X, obs=obs, obsp=obsp, obsm=obsm, var=var, layers=layers, uns=uns)

    return dyn_adata


[docs]def converter(data_in, from_type="adata", to_type="vlm", dir="."): """ convert adata to loom object - we may save_fig to a temp directory automatically - we may write a on-the-fly converter which doesn't involve saving and reading files """ if from_type == "adata": if to_type == "vlm": file = dir + "/data.loom" data_in.write_loom(file) data_out = vcy.VelocytoLoom(file) elif from_type == "vlm": if to_type == "adata": data_out = vlm_to_adata(vlm) # required by plot_phase_portraits data_out.ra["Gene"] = data_out.ra["var_names"] colors20 = np.vstack( ( plt.cm.tab20b(np.linspace(0.0, 1, 20))[::2], plt.cm.tab20c(np.linspace(0, 1, 20))[1::2], ) ) def colormap_fun(x: np.ndarray) -> np.ndarray: return colors20[np.mod(x, 20)] data_out.colorandum = colormap_fun([1] * data_out.S.shape[1]) return data_out
def run_velocyto(adata): """ 1. convert adata to vlm data 2. set up PCA, UMAP, etc. 3. estimate the gamma parameter """ vlm = converter(adata) # U_norm: log2(U_sz + pcount) # vlm.U_sz: norm_factor * U # S_norm: log2(S_sz + pcount) # vlm.S_sz norm_factor * S # vlm.Ux: smoothed unspliced # vlm.Sx: smoothed spliced # vlm.Ux_sz: smoothed unspliced -- old code # vlm.Sx_sz: smoothed spliced -- old code vlm.normalize() # add U_norm, U_sz, S_norm, S_sz vlm.perform_PCA() vlm.knn_imputation() # Ux, Sx, Ux_sz, Sx_sz vlm.pcs = adata.X # pcs: cell x npcs ndarray # vlm.Sx = vlm.S_sz # vlm.Ux = vlm.U_sz # vlm.Sx_sz = vlm.S_sz # vlm.Ux_sz = vlm.U_sz # gamma fit vlm.fit_gammas() # limit_gamma = False, fit_offset = True, use_imputed_data = False, use_size_norm = False # estimate velocity vlm.predict_U() vlm.calculate_velocity() # predict future state after dt vlm.calculate_shift() # assumption = 'constant_velocity' vlm.extrapolate_cell_at_t() # delta_t = 1. return vlm def run_scvelo(adata): """ 1. set up PCA, UMAP, etc. 2. estimate gamma and all other parameters 3. return results (adata.var['velocity_gamma']) """ # scv.pp.filter_and_normalize(adata, min_counts=2, min_counts_u=1, n_top_genes=3000) scv.pp.moments(adata) # , n_pcs = 12, n_neighbors = 15, mode = 'distances' scv.tl.velocity(adata) scv.tl.velocity_graph(adata) # how to fit other parameters, beta, etc.? return adata def mean_var_by_time(X, Time): import pandas as pd exp_data = pd.DataFrame(X) exp_data["Time"] = Time mean_val = exp_data.groupby(["Time"]).mean() var_val = exp_data.groupby(["Time"]).var() return mean_val.values, var_val.values def run_dynamo_deprecated(adata, normalize=True, init_num=1, sample_method="lhs"): time = adata.obs["Step"].values uniqe_time = list(set(time)) gene_num = adata.X.shape[1] # prepare data import numpy as np x_data = np.zeros((8, len(uniqe_time), gene_num)) # use unique time uu, ul, su, sl = ( adata.layers["uu"].toarray(), adata.layers["ul"].toarray(), adata.layers["su"].toarray(), adata.layers["sl"].toarray(), ) uu = np.log2(uu + 1) if normalize else uu ul = np.log2(ul + 1) if normalize else ul su = np.log2(su + 1) if normalize else su sl = np.log2(sl + 1) if normalize else sl x_data[0], x_data[4] = mean_var_by_time(uu, time) x_data[1], x_data[5] = mean_var_by_time(ul, time) x_data[2], x_data[6] = mean_var_by_time(su, time) x_data[3], x_data[7] = mean_var_by_time(sl, time) # estimation all parameters p0_range = { "a": [0, 1], "b": [0, 1], "la": [0, 1], "alpha_a": [10, 1000], "alpha_i": [0, 10], "sigma": [0, 1], "beta": [0, 10], "gamma": [0, 10], } estm = estimation(list(p0_range.values())) param_out = pd.DataFrame( index=adata.var.index, columns=[ "a", "b", "la", "alpha_a", "alpha_i", "sigma", "beta", "gamma", ], ) for i in range(gene_num): cur_x_data = x_data[:, :, i].squeeze() param_out.iloc[i, :], cost = estm.fit_lsq( uniqe_time, cur_x_data, p0=None, n_p0=init_num, sample_method=sample_method, ) # estimate only on the spliced and unspliced dataset # estimate on the labeled and unlabeled dataset # store the fitting result in adata.uns adata.uns.update({"dynamo": param_out}) return adata def run_dynamo_simple_fit_deprecated(adata, log=True): ncells, gene_num = adata.X.shape # estimation all parameters param_out = pd.DataFrame(index=adata.var.index, columns=["alpha", "gamma"]) u, s = adata.layers["unspliced"], adata.layers["spliced"] velocity_u, velocity_s = u, s for i in range(gene_num): cur_u, cur_s = u[:, i], s[:, i] gamma = fit_gamma(cur_u.toarray().squeeze(), cur_s.toarray().squeeze()) alpha = np.mean(cur_s) velocity_u[:, i] = cur_u - cur_s * gamma velocity_s[:, i] = cur_s / (1 - np.exp(-1)) - cur_u param_out.iloc[i, :] = [alpha, gamma] adata.layers["velocity_u"] = velocity_u adata.layers["velocity_s"] = velocity_s adata.uns.update({"dynamo_simple_fit": param_out}) return adata def run_dynamo_labelling_deprecated(adata, log=True, group=False): ncells, gene_num = adata.X.shape # estimation all parameters T = adata.obs["Time"] groups = [""] if group == False else np.unique(adata.obs[group]) param_out = pd.DataFrame( index=adata.var.index, columns=[i + "_" + j for j in groups for i in ["alpha", "gamma", "u0", "l0"]], ) L, U = adata.layers["L"], adata.layers["U"] velocity_u, velocity_s = L, U for i in range(gene_num): all_parm = [] for cur_grp in groups.tolist(): cur_L, cur_U = ( (L[:, i], U[:, i]) if cur_grp == "" else ( L[adata.obs[group] == cur_grp, i], U[adata.obs[group] == cur_grp, i], ) ) if log: cur_U, cur_L = ( np.log1p(cur_U.toarray().squeeze()), np.log1p(cur_L.toarray().squeeze()), ) else: cur_U, cur_L = ( cur_U.toarray().squeeze(), cur_L.toarray().squeeze(), ) gamma, l0 = fit_gamma_labelling(T, cur_L, mode=None) alpha, u0 = fit_alpha_labelling(T, cur_U, gamma, mode=None) tmp = [alpha, gamma, u0, l0] all_parm.extend(tmp) velocity_u[:, i] = (cur_L - cur_U * gamma)[:, None] velocity_s[:, i] = (cur_U / (1 - np.exp(-1)) - cur_L)[:, None] adata.layers[cur_grp + "velocity_u"] = velocity_u adata.layers[cur_grp + "velocity_s"] = velocity_s param_out.iloc[i, :] = all_parm adata.uns.update({"dynamo_labelling": param_out}) return adata def compare_res_deprecated( adata, velocyto_res, svelo_res, dynamo_res, a_val, b_val, la_val, alpha_a_val, alpha_i_val, sigma_val, beta_val, gamma_val, ): """ function to compare results from velocyto and scvelo with our new method 0. retrieve gamm or gamma with other parameters from velocyto result or scvelo 1. plot the correlation between parameters estimated with different methods 2. calculate the correltion between those parameters """ # self._offset, self._offset2, self._beta, self._gamma, self._r2, self._velocity_genes velocyto_gammas = velocyto_res.gammas scvelo_gammas = svelo_res.var["velocity_gamma"] # scatter plot the true gammas with our result plt.subplots(figsize=(15, 5)) plt.plot() plt.subplot(131) plt.plot(gamma_val, velocyto_gammas, "o") plt.xlabel(r"True $\gamma$") plt.ylabel(r"$\gamma$ (velocyto)") plt.subplot(132) plt.plot(gamma_val, scvelo_gammas, "o") plt.xlabel(r"True $\gamma$") plt.ylabel(r"$\gamma$ (scvelo)") plt.subplot(133) plt.plot(gamma_val, dynamo_res.uns["dynamo"]["gamma"], "o") plt.xlabel(r"True $\gamma$") plt.ylabel(r"$\gamma$ (dynamo)") # what if we only have a small number of parameters? plt.subplots(figsize=(15, 5)) plt.plot() plt.subplot(131) plt.plot(alpha_a_val, svelo_res.var["fit_alpha"], "o") plt.xlabel(r"True alpha") plt.ylabel(r"$\alpha$ (scvelo)") plt.subplot(132) plt.plot(beta_val, svelo_res.var["fit_beta"], "o") plt.xlabel(r"True $\beta$") plt.ylabel(r"$\beta$ (scvelo)") plt.subplot(133) plt.plot(gamma_val, svelo_res.var["fit_gamma"], "o") plt.xlabel(r"True $\gamma$") plt.ylabel(r"$\gamma$ (scvelo)") # param_out = pd.DataFrame(index=adata.var.index, columns=['a', 'b', 'la', 'alpha_a', 'alpha_i', 'sigma', 'beta', 'gamma']) # what if we only have a small number of parameters? plt.subplots(figsize=(15, 15)) plt.subplot(331) plt.plot(a_val, adata.uns["dynamo"]["a"], "o") plt.xlabel(r"True $a$") plt.ylabel(r"$a$ (dynamo)") plt.subplot(332) plt.plot(b_val, adata.uns["dynamo"]["b"], "o") plt.xlabel(r"True $b$") plt.ylabel(r"$b$ (dynamo)") plt.subplot(333) plt.plot(la_val, adata.uns["dynamo"]["la"], "o") plt.xlabel(r"True $l_a$") plt.ylabel(r"$l_a$ (dynamo)") plt.subplot(334) plt.plot(alpha_a_val, adata.uns["dynamo"]["alpha_a"], "o") plt.xlabel(r"True $\alpha_a$") plt.ylabel(r"$\alpha_a$ (dynamo)") plt.subplot(335) plt.plot(alpha_i_val, adata.uns["dynamo"]["alpha_i"], "o") plt.xlabel(r"True $\alpha_i$") plt.ylabel(r"$\alpha_i$ (dynamo)") plt.subplot(336) plt.plot(sigma_val, adata.uns["dynamo"]["sigma"], "o") plt.xlabel(r"True $\sigma$") plt.ylabel(r"$\sigma$ (dynamo)") plt.subplot(337) plt.plot(beta_val, adata.uns["dynamo"]["beta"], "o") plt.xlabel(r"True $\beta$") plt.ylabel(r"$\beta$ (dynamo)") plt.subplot(338) plt.plot(gamma_val, adata.uns["dynamo"]["gamma"], "o") plt.xlabel(r"True $\gamma$") plt.ylabel(r"$\gamma$ (dynamo)") velocyto_coef = {"gamma": np.corrcoef(gamma_val, velocyto_gammas)[1, 0]} scvelo_coef = { "alpha": np.corrcoef(alpha_a_val, svelo_res.var["fit_alpha"])[1, 0], "beta": np.corrcoef(beta_val, svelo_res.var["fit_beta"])[1, 0], "gamma": np.corrcoef(gamma_val, svelo_res.var["fit_gamma"])[1, 0], } dynamo_coef = { "a": np.corrcoef(a_val, list(dynamo_res.uns["dynamo"]["a"]))[1, 0], "b": np.corrcoef(b_val, list(dynamo_res.uns["dynamo"]["b"]))[1, 0], "la": np.corrcoef(la_val, list(dynamo_res.uns["dynamo"]["la"]))[1, 0], "alpha_a": np.corrcoef(alpha_a_val, list(dynamo_res.uns["dynamo"]["alpha_a"]))[1, 0], "alpha_i": np.corrcoef(alpha_i_val, list(dynamo_res.uns["dynamo"]["alpha_i"]))[1, 0], "sigma": np.corrcoef(sigma_val, list(dynamo_res.uns["dynamo"]["sigma"]))[1, 0], "beta": np.corrcoef(beta_val, list(dynamo_res.uns["dynamo"]["beta"]))[1, 0], "gamma": np.corrcoef(gamma_val, list(dynamo_res.uns["dynamo"]["gamma"]))[1, 0], } return { "velocyto": pd.DataFrame.from_dict(velocyto_coef, orient="index").T, "scvelo": pd.DataFrame.from_dict(scvelo_coef, orient="index").T, "dynamo": pd.DataFrame.from_dict(dynamo_coef, orient="index").T, }