Monocle 2

[1]:
# load modules
import os
import numpy as np
import pandas as pd
import pickle

# Plotting imports
import matplotlib
import matplotlib.pyplot as plt

import palantir
import scanpy as sc
findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans.
findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.
[3]:
%matplotlib inline

Download data

Anndata objects with all the data and metadata are publically avaiable at: https://s3.amazonaws.com/dp-lab-data-public/palantir/human_cd34_bm_rep[1-3].h5ad. This notebook use replicate 1 (https://s3.amazonaws.com/dp-lab-data-public/palantir/human_cd34_bm_rep1.h5ad) for illustration.

Description of the anndata object is available at https://s3.amazonaws.com/dp-lab-data-public/palantir/readme

Load data

[4]:
# Load the AnnData object
ad = sc.read('annadata/human_cd34_bm_rep1.h5ad')
colors = pd.Series(ad.uns['cluster_colors'])
ct_colors = pd.Series(ad.uns['ct_colors'])

Monocle 2

[5]:
boundary_clusters = pd.Series([])
boundary_clusters['HSC'] = 0
boundary_clusters['Mono'] = 3
boundary_clusters['Mega'] = 9
boundary_clusters['CLP'] = 5
boundary_clusters['Ery'] = 8
boundary_clusters['DC'] = 7
[6]:
# Gene subsets to be used for Monocle 2
statistic = pd.DataFrame(ad.varm['mast_diff_res_statistic'],
                        index=ad.var_names, columns=ad.uns['mast_diff_res_columns'])

gene_subsets = dict()
for ct in boundary_clusters.index:
    print(ct)
    others = boundary_clusters.index.difference([ct])
    index = 'Cluster{}'.format(boundary_clusters[ct])

    genes = statistic.index[(statistic.loc[:, index] > 0.1) &
                            (statistic.iloc[:, boundary_clusters[others]].max(axis=1) < 0.25)]
    genes = statistic.loc[genes, index].sort_values().index[::-1]

    if ct == 'DC':
        genes = genes[:100]
    if ct == 'Mega' or ct == 'CLP':
        genes = genes[:250]

    print(len(genes))
    gene_subsets[ct] = list(genes)

HSC
281
Mono
274
Mega
250
CLP
250
Ery
584
DC
100

We export data and use the Monocle2 R Script to compute the results. The exporting code is shown below

# Export data
monocle_dir = 'monocle2/'
os.makedirs(monocle_dir, exist_ok=True)

# Genes
genes = np.unique(np.concatenate(list(gene_subsets.values())))
# Expression matrix
counts = pd.DataFrame(ad.raw.X.todense(), index=ad.obs_names, columns=ad.var_names)
pd.DataFrame(counts).to_csv(f'{monocle_dir}/counts.csv')
pd.DataFrame(genes).to_csv(f'{monocle_dir}/genes.csv')

R snippet used is shown below was saved as monocle_wrapper.R

library(monocle, quiet=TRUE)

args <- commandArgs(trailingOnly = TRUE)
out_dir <- args[1]


print('Loading data...')
umi_matrix <- read.csv(sprintf("%s/counts.csv", out_dir), row.names=1)

# Annoated dataframes
pd = data.frame(cell_names = rownames(umi_matrix))
rownames(pd) = rownames(umi_matrix)
fd = data.frame(gene_short_name = colnames(umi_matrix))
rownames(fd) = colnames(umi_matrix)

HSMM <- newCellDataSet(as(t(umi_matrix), "sparseMatrix"),
    phenoData = AnnotatedDataFrame(pd),
    featureData = AnnotatedDataFrame(fd),
    expressionFamily=negbinomial.size())

# Size factors and dispersion
print('Size factors and dispersion')
HSMM <- estimateSizeFactors(HSMM)
HSMM <- estimateDispersions(HSMM)


# Differentially expressed genes from clusters
print('Ordering genes...')
ordering_genes <- read.csv(sprintf("%s/genes.csv", out_dir), stringsAsFactors=FALSE)[,2]
HSMM_subset <- setOrderingFilter(HSMM, ordering_genes)


print('Dimensionality reduction...')
HSMM_subset <- reduceDimension(HSMM_subset, max_components = 2,
    method = 'DDRTree')


HSMM_subset <- orderCells(HSMM_subset)
# early_cell = 'Run5_205922701598003'
early_cell = 'GSM2743164_rep1_tmouse_1068'
# Rerun with early state
HSMM_subset <- orderCells(HSMM_subset, root_state = pData(HSMM_subset)[early_cell, 'State'])


save(HSMM_subset, file=sprintf("%s/HSMM_subset.rda", out_dir))
write.csv(pData(HSMM_subset), sprintf("%s/phenodata.csv", out_dir), quote=FALSE)
write.csv(reducedDimS(HSMM_subset), sprintf("%s/red_dims.csv", out_dir), quote=FALSE)

The above script was invoked as shown below

import subprocess

ss_path = 'monocle2/monocle_wrapper.R'
args = f'Rscript {ss_path} {monocle_dir}'
print('Running Monocle2 using: {}\n'.format(args))
with subprocess.Popen(args,
                      stdout=None, stderr=subprocess.PIPE,
                      shell=True) as p:
    out, err = p.communicate()

The results from the above run is available at results/monocle2

Load results

[7]:
pdata = pd.read_csv('results/monocle2/phenodata.csv', index_col=0)
rdims = pd.read_csv('results/monocle2/red_dims.csv', index_col=0).T
rdims.columns = ['x', 'y']

States

[8]:
plt.figure(figsize=[5, 5])
states = pdata.State.unique()
s_colors = matplotlib.colormaps["hls"](range(len(states)))
s_colors = pd.Series([matplotlib.colors.rgb2hex(rgba) for rgba in s_colors], index = states)
plt.scatter(rdims.loc[:, 'x'], rdims.loc[:, 'y'], s=5,
           color=s_colors[pdata.State])
ax = plt.gca()
ax.set_axis_off()
../../_build/doctrees/nbsphinx/notebooks_comparisons_monocle2_20_0.png
[9]:
plt.figure(figsize=[5, 5])
plt.scatter(rdims.loc[:, 'x'], rdims.loc[:, 'y'], s=5,
           color=colors[ad.obs['clusters']])
ax = plt.gca()
ax.set_axis_off()
../../_build/doctrees/nbsphinx/notebooks_comparisons_monocle2_21_0.png

Gene expression

[10]:
imputed_data = pd.DataFrame(ad.obsm['MAGIC_imputed_data'],
                           index=ad.obs_names, columns=ad.var_names)
[11]:
genes = ['CD34', 'MPO', 'CD79B', 'GATA1', 'CSF1R', 'ITGA2B']
labels = ['CD34', 'MPO', 'CD79B', 'GATA1', 'CSF1R', 'CD41']

fig = palantir.plot.FigureGrid(6, 3)
for gene, label, ax in zip(genes, labels, fig):
    ax.scatter(rdims.loc[:, 'x'], rdims.loc[:, 'y'],
              s=5, cmap=matplotlib.cm.Spectral_r, c=imputed_data.loc[rdims.index, gene])
    ax.set_axis_off()
    ax.set_title(label)

../../_build/doctrees/nbsphinx/notebooks_comparisons_monocle2_24_0.png