scEU-seq organoid

This tutorial uses the intestine organoid data from Battich, et al (2020). This tutorial is the second one of the two tutorials for demonstrating how dynamo can use used to analyze the scEU-seq data. Please refer the cell cycle tutorial for details on how to analyze the cell cycle dataset.

# !pip install git+https://github.com/aristoteleo/dynamo-release@master
import warnings
warnings.filterwarnings('ignore')

import dynamo as dyn
import anndata
import pandas as pd
import numpy as np
import scipy.sparse

from anndata import AnnData
from scipy.sparse import csr_matrix

# dyn.get_all_dependencies_version()

Load data

organoid = dyn.sample_data.scEU_seq_organoid()

|-----> Downloading scEU_seq data
|-----> Downloading data to ./data/organoid.h5ad
|-----> in progress: 99.0108%|-----> [download] completed [17.9992s]

organoid

AnnData object with n_obs × n_vars = 3831 × 9157
    obs: 'well_id', 'batch_id', 'treatment_id', 'log10_gfp', 'rotated_umap1', 'rotated_umap2', 'som_cluster_id', 'monocle_branch_id', 'monocle_pseudotime', 'exp_type', 'time'
    var: 'ID', 'NAME'
    layers: 'sl', 'su', 'ul', 'uu'

# mapping:
cell_mapper = {
    '1': 'Enterocytes',
    '2': 'Enterocytes',
    '3': 'Enteroendocrine',
    '4': 'Enteroendocrine progenitor',
    '5': 'Tuft cells',
    '6': 'TA cells',
    '7': 'TA cells',
    '8': 'Stem cells',
    '9': 'Paneth cells',
    '10': 'Goblet cells',
    '11': 'Stem cells',
 }

organoid.obs['cell_type'] = organoid.obs.som_cluster_id.map(cell_mapper).astype('str')

typical dynamo analysis workflow

dyn.pl.basic_stats(organoid)

organoid

AnnData object with n_obs × n_vars = 3831 × 9157
    obs: 'well_id', 'batch_id', 'treatment_id', 'log10_gfp', 'rotated_umap1', 'rotated_umap2', 'som_cluster_id', 'monocle_branch_id', 'monocle_pseudotime', 'exp_type', 'time', 'cell_type', 'nGenes', 'nCounts', 'pMito'
    var: 'ID', 'NAME', 'nCells', 'nCounts'
    layers: 'sl', 'su', 'ul', 'uu'

organoid.obs

	well_id	batch_id	treatment_id	log10_gfp	rotated_umap1	rotated_umap2	som_cluster_id	monocle_branch_id	monocle_pseudotime	exp_type	time	cell_type	nGenes	nCounts	pMito
1	14	01	Pulse_120	12.8929281522	23.0662174225	-3.47039175034	6	2	6.08688834859	Pulse	120	TA cells	1054	1426.0	0.0
2	15	01	Pulse_120	5.85486775252	25.710735321	-1.31835341454	2	2	9.14740876358	Pulse	120	Enterocytes	1900	3712.0	0.0
3	16	01	Pulse_120	7.45690471634	26.7709560394	-1.06682777405	2	2	9.69134627386	Pulse	120	Enterocytes	2547	6969.0	0.0
4	17	01	Pulse_120	94.2814535609	21.2927913666	0.0159659013152	11	2	4.2635104705	Pulse	120	Stem cells	1004	1263.0	0.0
5	21	01	Pulse_120	47.1412266395	19.9096126556	0.884054124355	11	1	2.62248093423	Pulse	120	Stem cells	927	1144.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
3827	378	12	Pulse_120	32.496816667	20.7663478851	-3.72811675072	8	3	7.32939908351	Pulse	120	Stem cells	2268	3918.0	0.0
3828	379	12	Pulse_120	78.1198193763	20.1073760986	-2.65023303032	8	3	5.10436147713	Pulse	120	Stem cells	2131	3619.0	0.0
3829	380	12	Pulse_120	53.249846399	20.1618804932	-3.83158016205	8	3	6.43742448317	Pulse	120	Stem cells	2141	3603.0	0.0
3830	381	12	Pulse_dmso	16.7070737849	15.4272613525	-2.15779066086	10	1	0.657880511889	Pulse	dmso	Goblet cells	1158	1683.0	0.0
3831	383	12	Pulse_dmso	93.3716092195	21.5953540802	-3.90664196014	6	2	4.81727202212	Pulse	dmso	TA cells	1374	1838.0	0.0

3831 rows × 15 columns

organoid.obs.groupby(['exp_type', 'time']).agg('count')

		well_id	batch_id	treatment_id	log10_gfp	rotated_umap1	rotated_umap2	som_cluster_id	monocle_branch_id	monocle_pseudotime	cell_type	nGenes	nCounts	pMito
exp_type	time
Chase	0	660	660	660	660	660	660	660	660	660	660	660	660	660
	45	821	821	821	821	821	821	821	821	821	821	821	821	821
	120	0	0	0	0	0	0	0	0	0	0	0	0	0
	360	646	646	646	646	646	646	646	646	646	646	646	646	646
	dmso	0	0	0	0	0	0	0	0	0	0	0	0	0
Pulse	0	0	0	0	0	0	0	0	0	0	0	0	0	0
	45	0	0	0	0	0	0	0	0	0	0	0	0	0
	120	1373	1373	1373	1373	1373	1373	1373	1373	1373	1373	1373	1373	1373
	360	0	0	0	0	0	0	0	0	0	0	0	0	0
	dmso	331	331	331	331	331	331	331	331	331	331	331	331	331

adata = organoid.copy()
adata.obs.time = adata.obs.time.astype('str')
adata.obs.loc[adata.obs['time'] == 'dmso', 'time'] = -1
adata.obs['time'] = adata.obs['time'].astype(float)
adata = adata[adata.obs.time != -1, :]
adata = adata[adata.obs.exp_type == 'Pulse', :]
adata.layers['new'], adata.layers['total'] = adata.layers['ul'] + adata.layers['sl'], adata.layers['su'] + adata.layers['sl'] + adata.layers['uu'] + adata.layers['ul']
del adata.layers['uu'], adata.layers['ul'], adata.layers['su'], adata.layers['sl']
# dyn.pp.recipe_monocle(adata, n_top_genes=1000, total_layers=False)
preprocessor = dyn.pp.Preprocessor(cell_cycle_score_enable=True)
preprocessor.config_monocle_recipe(adata, n_top_genes=1000)
preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = True
preprocessor.preprocess_adata_monocle(adata)
dyn.pl.basic_stats(adata)
dyn.pl.show_fraction(organoid)

|-----> Running monocle preprocessing pipeline...
|-----? No 'tkey' value was given despite 'tkey' information in the adata, so we will use 'time' in the adata as the default.
|-----------> filtered out 0 outlier cells
|-----------> filtered out 815 outlier genes
|-----> PCA dimension reduction
|-----> <insert> X_pca to obsm in AnnData Object.
|-----> computing cell phase...
|-----> [Cell Phase Estimation] completed [20.6928s]
|-----> [Cell Cycle Scores Estimation] completed [0.0563s]
|-----> [Preprocessor-monocle] completed [1.9278s]

adata.obs.time = adata.obs.time/60

adata.obs.time  = adata.obs.time.astype('float')
dyn.tl.dynamics(adata, model='deterministic', tkey='time', assumption_mRNA='ss')

dyn.tl.reduceDimension(adata)

|-----> dynamics_del_2nd_moments_key is None. Using default value from DynamoAdataConfig: dynamics_del_2nd_moments_key=False
|-----------> removing existing M layers:[]...
|-----------> making adata smooth...
|-----> calculating first/second moments...
|-----> [moments calculation] completed [6.7972s]
|-----? Your adata only has labeling data, but NTR_vel is set to be False. Dynamo will reset it to True to enable this analysis.

estimating gamma: 100%|█████████████████████| 1000/1000 [00:10<00:00, 91.65it/s]

|-----> retrieve data for non-linear dimension reduction...
|-----> [UMAP] using X_pca with n_pca_components = 30
|-----> <insert> X_umap to obsm in AnnData Object.
|-----> [UMAP] completed [7.7419s]

dyn.tl.cell_velocities(adata, ekey='M_t', vkey='velocity_T', enforce=True)

|-----> incomplete neighbor graph info detected: connectivities and distances do not exist in adata.obsp, indices not in adata.uns.neighbors.
|-----> Neighbor graph is broken, recomputing....
|-----> Start computing neighbor graph...
|-----------> X_data is None, fetching or recomputing...
|-----> fetching X data from layer:None, basis:pca
|-----> method arg is None, choosing methods automatically...
|-----------> method ball_tree selected
|-----> [calculating transition matrix via pearson kernel with sqrt transform.] in progress: 100.0000%|-----> [calculating transition matrix via pearson kernel with sqrt transform.] completed [2.5574s]
|-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000%|-----> [projecting velocity vector to low dimensional embedding] completed [0.2453s]

AnnData object with n_obs × n_vars = 1373 × 9157
    obs: 'well_id', 'batch_id', 'treatment_id', 'log10_gfp', 'rotated_umap1', 'rotated_umap2', 'som_cluster_id', 'monocle_branch_id', 'monocle_pseudotime', 'exp_type', 'time', 'cell_type', 'nGenes', 'nCounts', 'pMito', 'pass_basic_filter', 'total_Size_Factor', 'initial_total_cell_size', 'new_Size_Factor', 'initial_new_cell_size', 'Size_Factor', 'initial_cell_size', 'ntr', 'cell_cycle_phase'
    var: 'ID', 'NAME', 'nCells', 'nCounts', 'pass_basic_filter', 'log_m', 'log_cv', 'score', 'frac', 'use_for_pca', 'ntr', 'alpha', 'beta', 'gamma', 'half_life', 'alpha_b', 'alpha_r2', 'gamma_b', 'gamma_r2', 'gamma_logLL', 'delta_b', 'delta_r2', 'bs', 'bf', 'uu0', 'ul0', 'su0', 'sl0', 'U0', 'S0', 'total0', 'beta_k', 'gamma_k', 'use_for_dynamics', 'use_for_transition'
    uns: 'pp', 'velocyto_SVR', 'feature_selection', 'PCs', 'explained_variance_ratio_', 'pca_mean', 'cell_phase_genes', 'dynamics', 'neighbors', 'umap_fit', 'grid_velocity_umap'
    obsm: 'X_pca', 'cell_cycle_scores', 'X_umap', 'velocity_umap'
    varm: 'alpha'
    layers: 'new', 'total', 'X_total', 'X_new', 'M_t', 'M_tt', 'M_n', 'M_tn', 'M_nn', 'velocity_N', 'velocity_T'
    obsp: 'moments_con', 'distances', 'connectivities', 'pearson_transition_matrix'

adata.obsm['X_umap_ori'] = adata.obs.loc[:, ['rotated_umap1', 'rotated_umap2']].values.astype(float)

Visualize time-resolved vector flow learned with dynamo

dyn.tl.cell_velocities(adata, basis='umap_ori')

dyn.pl.streamline_plot(adata, color='cell_type', basis='umap_ori')

Using existing pearson_transition_matrix found in .obsp.
|-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000%|-----> [projecting velocity vector to low dimensional embedding] completed [0.2327s]
|-----------> plotting with basis key=X_umap_ori
|-----------> skip filtering cell_type by stack threshold when stacking color because it is not a numeric type

dyn.pl.streamline_plot(adata, color='cell_cycle_phase', basis='umap_ori')

|-----------> plotting with basis key=X_umap_ori
|-----------> skip filtering cell_cycle_phase by stack threshold when stacking color because it is not a numeric type

adata.var_names[adata.var.use_for_transition][:5]

Index(['Cdc45', 'Brat1', 'Ccnd2', 'Ckmt1', 'Pdgfb'], dtype='object')

dyn.pl.phase_portraits(adata, genes=['Brat1', 'Ccnd2', 'Ckmt1', 'Pdgfb', 'Gpa33'],
                       color='som_cluster_id', basis='umap_ori')

Animate intestine organoid differentiation

dyn.vf.VectorField(adata, basis='umap_ori')

|-----> VectorField reconstruction begins...
|-----> Retrieve X and V based on basis: UMAP_ORI.
        Vector field will be learned in the UMAP_ORI space.
|-----> Generating high dimensional grids and convert into a row matrix.
|-----> Learning vector field with method: sparsevfc.
|-----> [SparseVFC] begins...
|-----> Sampling control points based on data velocity magnitude...
|-----> [SparseVFC] completed [0.1492s]
|-----> [VectorField] completed [0.1768s]

progenitor = adata.obs_names[adata.obs.cell_type == 'Stem cells']
len(progenitor)

np.random.seed(19491001)

from matplotlib import animation
info_genes = adata.var_names[adata.var.use_for_transition]
dyn.pd.fate(adata, basis='umap_ori', init_cells=progenitor[:100], interpolation_num=100,  direction='forward',
   inverse_transform=False, average=False)

integration with ivp solver: 100%|████████████| 100/100 [00:08<00:00, 11.67it/s]
uniformly sampling points along a trajectory: 100%|█| 100/100 [00:00<00:00, 550.

AnnData object with n_obs × n_vars = 1373 × 9157
    obs: 'well_id', 'batch_id', 'treatment_id', 'log10_gfp', 'rotated_umap1', 'rotated_umap2', 'som_cluster_id', 'monocle_branch_id', 'monocle_pseudotime', 'exp_type', 'time', 'cell_type', 'nGenes', 'nCounts', 'pMito', 'pass_basic_filter', 'total_Size_Factor', 'initial_total_cell_size', 'new_Size_Factor', 'initial_new_cell_size', 'Size_Factor', 'initial_cell_size', 'ntr', 'cell_cycle_phase', 'control_point_umap_ori', 'inlier_prob_umap_ori', 'obs_vf_angle_umap_ori'
    var: 'ID', 'NAME', 'nCells', 'nCounts', 'pass_basic_filter', 'log_m', 'log_cv', 'score', 'frac', 'use_for_pca', 'ntr', 'alpha', 'beta', 'gamma', 'half_life', 'alpha_b', 'alpha_r2', 'gamma_b', 'gamma_r2', 'gamma_logLL', 'delta_b', 'delta_r2', 'bs', 'bf', 'uu0', 'ul0', 'su0', 'sl0', 'U0', 'S0', 'total0', 'beta_k', 'gamma_k', 'use_for_dynamics', 'use_for_transition'
    uns: 'pp', 'velocyto_SVR', 'feature_selection', 'PCs', 'explained_variance_ratio_', 'pca_mean', 'cell_phase_genes', 'dynamics', 'neighbors', 'umap_fit', 'grid_velocity_umap', 'grid_velocity_umap_ori', 'cell_type_colors', 'cell_cycle_phase_colors', 'VecFld_umap_ori', 'fate_umap_ori'
    obsm: 'X_pca', 'cell_cycle_scores', 'X_umap', 'velocity_umap', 'X_umap_ori', 'velocity_umap_ori', 'velocity_umap_ori_SparseVFC', 'X_umap_ori_SparseVFC'
    varm: 'alpha'
    layers: 'new', 'total', 'X_total', 'X_new', 'M_t', 'M_tt', 'M_n', 'M_tn', 'M_nn', 'velocity_N', 'velocity_T'
    obsp: 'moments_con', 'distances', 'connectivities', 'pearson_transition_matrix'

%%capture
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax = dyn.pl.topography(adata, basis='umap_ori', color='cell_type', ax=ax, save_show_or_return='return',  figsize=(24, 24))
ax.set_aspect(0.8)

|-----> VectorField reconstruction begins...
|-----> Retrieve X and V based on basis: UMAP.
        Vector field will be learned in the UMAP space.
|-----> Generating high dimensional grids and convert into a row matrix.
|-----> Learning vector field with method: sparsevfc.
|-----> [SparseVFC] begins...
|-----> Sampling control points based on data velocity magnitude...
|-----> [SparseVFC] completed [0.1355s]
|-----> Mapping topography...
|-----> [VectorField] completed [1.8211s]
|-----------> plotting with basis key=X_umap_ori
|-----------> skip filtering cell_type by stack threshold when stacking color because it is not a numeric type

%%capture
adata.obs['time'] = adata.obs.time.astype('float')
instance = dyn.mv.StreamFuncAnim(adata=adata, basis='umap_ori', color='cell_type', ax=ax)

|-----? the number of cell states with fate prediction is more than 50. You may want to lower the max number of cell states to draw via cell_states argument.
|-----------> plotting with basis key=X_umap_ori
|-----------> skip filtering cell_type by stack threshold when stacking color because it is not a numeric type