In this demo code, we demonstrate how to use Portal to integrate SS2 and 10X scRNA-seq datasets from the Tabula Muris project. We use the integration of two bladder datasets as an example.
The datasets can be downloaded from https://figshare.com/projects/Tabula_Muris_Transcriptomic_characterization_of_20_organs_and_tissues_from_Mus_musculus_at_single_cell_resolution/27733, and saved in folder "./Raw_data_TabulaMuris", including 10X scRNA-seq dataset (Single-cell RNA-seq data from microfluidic emulsion (v2)) and SS2 scRNA-seq dataset ( Single-cell RNA-seq data from Smart-seq2 sequencing of FACS sorted cells (v2)). The files need to be unzipped after downloading.
import os
import numpy as np
import pandas as pd
import scanpy as sc
sc.settings.verbosity = 3
sc.logging.print_header()
# Create a folder for saving preprocessed data
tissue = "Bladder"
data_path = "data/" + tissue
if not os.path.exists(data_path):
os.makedirs(data_path)
path_SS2 = "./Raw_data_TabulaMuris/FACS"
path_10X = "./Raw_data_TabulaMuris/droplet"
# Read annotation
meta_SS2 = pd.read_csv("./Raw_data_TabulaMuris/annotations_facs.csv",
keep_default_na=False)
meta_10X = pd.read_csv("./Raw_data_TabulaMuris/annotations_droplet.csv",
keep_default_na=False)
# Read SS2 cell-by-gene counts
adata_SS2 = sc.read_csv(os.path.join(path_SS2, "%s-counts.csv" % tissue)).transpose()
ERCC_idx = pd.Series(adata_SS2.var.index).str.startswith('ERCC')
cell_idx = adata_SS2.obs.index.isin(meta_SS2[(meta_SS2.cell_ontology_class != 0) &
(meta_SS2.cell_ontology_class != "")].cell)
adata_SS2 = adata_SS2[cell_idx, -ERCC_idx]
# Read 10X cell-by-gene counts
channels = sorted(set(meta_10X[meta_10X.tissue == tissue].channel))
for i, channel in enumerate(channels):
if i == 0:
adata_10X = sc.read_10x_mtx(path_10X + '/%s-%s/' % (tissue, channel),
var_names='gene_symbols',cache=False)
adata_10X.obs.index = channel + "_" + adata_10X.obs.index
adata_10X.obs.index = adata_10X.obs.index.map(lambda x: x[:-2])
cell_idx = adata_10X.obs.index.isin(meta_10X[(meta_10X.cell_ontology_class != 0) &
(meta_10X.cell_ontology_class != "")].cell)
adata_10X = adata_10X[cell_idx, :]
else:
tmp = sc.read_10x_mtx(path_10X + '/%s-%s/' % (tissue, channel),
var_names='gene_symbols',cache=False)
tmp.obs.index = channel + "_" + tmp.obs.index
tmp.obs.index = tmp.obs.index.map(lambda x: x[:-2])
cell_idx = tmp.obs.index.isin(meta_10X[(meta_10X.cell_ontology_class != 0) &
(meta_10X.cell_ontology_class != "")].cell)
adata_10X = adata_10X.concatenate(tmp[cell_idx, :], index_unique=None)
celltype_SS2 = meta_SS2[meta_SS2.cell.isin(adata_SS2.obs.index)][["cell", "cell_ontology_class"]].set_index("cell")
celltype_SS2["method"] = "SS2"
celltype_10X = meta_10X[meta_10X.cell.isin(adata_10X.obs.index)][["cell", "cell_ontology_class"]].set_index("cell")
celltype_10X["method"] = "10X"
meta = pd.concat([celltype_SS2, celltype_10X]).rename(columns={"cell_ontology_class": "celltype"})
meta.to_pickle(os.path.join(data_path, "meta_raw.pkl"))
sc.pp.filter_cells(adata_SS2, min_genes=200)
sc.pp.filter_genes(adata_SS2, min_cells=3)
sc.pp.filter_cells(adata_10X, min_genes=200)
sc.pp.filter_genes(adata_10X, min_cells=3)
meta = pd.read_pickle(os.path.join(data_path, "meta_raw.pkl"))
meta = meta.loc[list(adata_SS2.obs.index) + list(adata_10X.obs.index), ]
meta.to_pickle(os.path.join(data_path, "meta.pkl"))
import portal
# Specify the GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Create a folder for saving results
result_path = "result"
if not os.path.exists(result_path):
os.makedirs(result_path)
model = portal.model.Model(training_steps=1000)
model.preprocess(adata_SS2, adata_10X) # perform preprocess and PCA
model.train() # train the model
model.eval() # get integrated latent representation of cells
# The integration result can be visualized by portal.utils.plot_UMAP
portal.utils.plot_UMAP(model.latent, meta, colors=["method", "celltype"], save=True, result_path=result_path)
To deal with different scenarios, our package enables tuning the parameter lambda_cos to achieve a better mixing performance:
lowdim_list = portal.utils.preprocess_datasets([adata_SS2, adata_10X], hvg_num=4000, data_path=data_path)
integrated_data = portal.utils.integrate_datasets(lowdim_list,
search_cos=True,
training_steps=1000,
data_path=data_path)
portal.utils.plot_UMAP(integrated_data, meta, colors=["method", "celltype"], save=True, result_path=result_path)