DEG.Rmd
For this tutorial we will analyze the Human Gastrulation dataset reported by Tyser 2021. The dataset consists of 1195 cells. We will perform differential expression analysis using two methods: co-dependency index (CDI) and wilcoxon test.
We start by reading in the data and visualizing the annotated population. We are interested in identifying cell-type specific markers.
# load package
library(scMiko)
# load human gastrulation data
<- readRDS("../data/demo/so_tyser2021_220621.rds")
so.query
# visualize clusters
cluster.UMAP(so = so.query, group.by = "sub_cluster") + theme_void() + labs(title = "Tyser 2021",
subtitle = "Human Gastrulation")
To identifying cell-type specific markers, we will perform CDI- and Wilcoxon-based differentially-expressed gene (DEG) analyses. Since CDI is more computationally intensive, we will focus on the subset of genes that are expressed within the dataset.
# get expressed genes
<- getExpressedGenes(object = so.query)
expr_gene
# CDI
<- findCDIMarkers(object = so.query, features.x = "sub_cluster", features.y = expr_gene)
cdi_output
# Wilcoxon
<- getDEG(object = so.query, group_by = "sub_cluster", auc.thresh = NA, fdr.thresh = NA,
wilcoxon_output return.all = T, return.list = F)
Once the analysis has be complete, we can visualize the top
differentially-expressed genes using Seurat’s DotPlot
function.
# get top markers
<- wilcoxon_output %>%
wilcoxon_output.top ::group_by(group) %>%
dplyr::top_n(n = 1, wt = auc)
dplyr$group <- factor(gsub("sub_cluster_", "", cdi_output$feature.x), levels = unique(wilcoxon_output.top$group))
cdi_output<- cdi_output %>%
cdi_output ::arrange(group)
dplyr<- cdi_output %>%
cdi_output.top ::group_by(feature.x) %>%
dplyr::top_n(n = 1, wt = ncdi)
dplyr
# generate dot plots
<- DotPlot(object = so.query, features = unique(cdi_output.top$feature.y), group.by = "sub_cluster") +
plt.cdi_dot scale_color_miko() + labs(title = "CDI Markers", y = "Clusters", x = "Genes") + theme_miko(legend = T,
x.axis.rotation = 45) + theme(legend.position = "bottom")
<- DotPlot(object = so.query, features = unique(wilcoxon_output.top$feature), group.by = "sub_cluster") +
plt.wilcoxon_dot scale_color_miko() + labs(title = "Wilcoxon Markers", y = "Clusters", x = "Genes") + theme_miko(legend = T,
x.axis.rotation = 45) + theme(legend.position = "bottom")
# visualize
print(cowplot::plot_grid(plt.cdi_dot, plt.wilcoxon_dot, nrow = 1))
Finally, we can compare the relative sensitivity and specificity of markers obtained by each method.
# get top 10 markers per cluster
<- cdi_output %>%
cdi_output.top ::group_by(feature.x) %>%
dplyr::top_n(n = 10, wt = ncdi)
dplyr<- wilcoxon_output %>%
wilcoxon_output.top ::group_by(group) %>%
dplyr::top_n(n = 10, wt = auc)
dplyr
# get sensitivity and specificity indices that were calculated by getDEG()
<- wilcoxon_output %>%
cdi_sensitivity ::filter(paste0(feature, "_", group) %in% paste0(cdi_output.top$feature.y, "_", cdi_output.top$group))
dplyr$method <- "CDI"
cdi_sensitivity
<- wilcoxon_output %>%
wilcoxon_sensitivity ::filter(paste0(feature, "_", group) %in% paste0(wilcoxon_output.top$feature, "_", wilcoxon_output.top$group))
dplyr$method <- "Wilcoxon"
wilcoxon_sensitivity
<- bind_rows(cdi_sensitivity, wilcoxon_sensitivity)
df.eval
# generate plots
<- df.eval %>%
plt_sensitivity ggplot(aes(x = method, y = sensitivity, fill = method)) + geom_boxplot() + ggbeeswarm::geom_quasirandom() +
theme_miko(fill.palette = "ptol") + labs(title = "Sensitivity") + ylim(c(0, 1))
<- df.eval %>%
plt_specificity ggplot(aes(x = method, y = specificity, fill = method)) + geom_boxplot() + ggbeeswarm::geom_quasirandom() +
theme_miko(fill.palette = "ptol") + labs(title = "Specificity") + ylim(c(0, 1))
::plot_grid(plt_sensitivity, plt_specificity) cowplot
In many instances, single-cell data from multiple independent biological replicates are pooled. To minimize batch effects and ensure the validity of the CDI across independent samples, we recommend one of two alternative approaches:
We will use the Pijuan-Sala (2019) gastrulation data to demonstrate each approach. First we load in the count matrix, split it into component datasets and normalize each, and then merge the normalized data into a single Seurat object. Splitting the Seurat object into component data sets allows us to fit a SCT Model to each dataset.
# load murine gastrulation data
<- readRDS("../data/demo/ps_count_matrix.rds")
ps.data <- readRDS("../data/demo/ps_meta_data.rds")
ps.meta
# create seurat object
<- CreateSeuratObject(counts = ps.data, meta.data = ps.meta)
so.ps
# split by sample
<- SplitObject(object = so.ps, split.by = "sample")
so.ps.list
# sample-wise normalization of data
<- lapply(X = so.ps.list, FUN = SCTransform, method = "glmGamPoi", verbose = F, vst.flavor = "v2",
so.ps.list variable.features.rv.th = 1.3)
# merge normalized data
<- merge(so.ps.list[[1]], so.ps.list[-1]) so.ps
In the first approach, we will use the Seurat object with multiple independent samples and preprocess it using Seurat’s PrepSCTFindMarkers() function which effectively downsamples the count matrix to ensure a homogeneous sequencing depth across all component datasets. The processed Seurat data will then be provided as input into findCDIMarkers():
# enforce homogeneous sequencing depth across all samples
<- PrepSCTFindMarkers(so.ps)
so.ps_homogenous
# run CDI on heterogeneous sequencing depth data
<- findCDIMarkers(so.ps, features.x = "celltype", verbose = F)
cdi.heterogenous
# run CDI on homogeneous sequencing depth data
<- findCDIMarkers(so.ps_homogenous, features.x = "celltype", verbose = F)
cdi.homogeneous
# merge CDI results from both runs for comparison
<- merge(cdi.heterogenous, cdi.homogeneous, by = c("feature.x", "feature.y"))
cdi.merge.approach1
# plot CDI scores from depth-normalized run to visulaize how much results have changed
<- signif(cor(cdi.merge.approach1$ncdi.x, cdi.merge.approach1$ncdi.y), 3)
r.val <- cdi.merge.approach1 %>%
plt.comparison.approach1 ggplot(aes(x = ncdi.x, y = ncdi.y)) + geom_point() + labs(title = "Batch correction using Approach 1",
x = "nCDI (using non-corrected counts)", y = "nCDI (using corrected counts)", subtitle = paste0("Pearson's r = ",
+ theme_miko(legend = T) + geom_abline(linetype = "dashed", color = "tomato")
r.val))
plt.comparison.approach1
In the second approach, we will apply a meta-analytic approach to pool CDI results from independent samples thereby finding markers that are conserved across samples. P values from independent sample-specific estimates are pooled using the Fisher’s Method.
# run CDI for each sample and pool the sample-level statistics
<- findConservedCDIMarkers(object = so.ps, features.x = "celltype", group.by = "sample",
cdi.conserved n.workers = 15, verbose = F)
# merge CDI results from both runs for comparison
<- merge(cdi.heterogenous, cdi.conserved, by = c("feature.x", "feature.y"))
cdi.merge.approach2
# plot CDI scores from conserved CDI run to visulaize how much results have changed
<- signif(cor(cdi.merge.approach2$ncdi, cdi.merge.approach2$ncdi_mean), 3)
r.val.approach2 <- cdi.merge.approach2 %>%
plt.comparison.approach2 ggplot(aes(x = ncdi, y = ncdi_mean)) + geom_point() + labs(title = "Batch correction using Approach 2",
x = "nCDI (using non-corrected counts)", y = "nCDI (averaged across samples)", subtitle = paste0("Pearson's r = ",
+ theme_miko(legend = T) + geom_abline(linetype = "dashed", color = "tomato")
r.val.approach2))
plt.comparison.approach2
Here we compare the batch-normalized CDI results (approach 1 and 2) with uncorrected CDI results. Using a 5% FDR threshold, the overlap in significant gene sets is found to vary depending on which approach is taken. Here we show that Approach 1 yielded more consistent results with the uncorrected CDI results, whereas Approach 2 yielded less consistent results. In the latter, the results suggest the presence of batch-specific artefacts that may have contributed to slightly different results in the uncorrected CDI analysis. Although Approach 2 is more computationally intensive, it is rooted in established meta-analytic theory and should be employed when evaluated independent experimental samples.
# get DEGs at 5% FDR
<- longDF2namedList(cdi.heterogenous %>%
DEG.uncorrected ::filter(fdr < 0.05), group_by = "feature.x", values = "feature.y")
dplyr<- longDF2namedList(cdi.homogeneous %>%
DEG.approach1 ::filter(fdr < 0.05), group_by = "feature.x", values = "feature.y")
dplyr<- longDF2namedList(cdi.conserved %>%
DEG.approach2 ::filter(fdr < 0.05), group_by = "feature.x", values = "feature.y")
dplyr
# annotate method used
names(DEG.uncorrected) <- paste0("uncorrected_", names(DEG.uncorrected))
names(DEG.approach1) <- paste0("approach1_", names(DEG.approach1))
names(DEG.approach2) <- paste0("approach2_", names(DEG.approach2))
# combine DEG lists and compute jaccard similarities
<- c(DEG.uncorrected, DEG.approach1, DEG.approach2)
DEG.list <- jaccardSimilarityMatrix(DEG.list)
jmat
<- jmat[grepl("uncorrected", rownames(jmat)), grepl("approach1", rownames(jmat))]
unc_a1 <- jmat[grepl("uncorrected", rownames(jmat)), grepl("approach2", rownames(jmat))]
unc_a2 <- jmat[grepl("approach1", rownames(jmat)), grepl("approach2", rownames(jmat))]
a1_a2
<- apply(unc_a1, 1, max)
unc_a1.max <- apply(unc_a2, 1, max)
unc_a2.max <- apply(a1_a2, 1, max)
a1_a2.max
# combine jaccard similarity profiles from each method comparison
<- bind_rows(data.frame(Group_1 = "uncorrected", Group_2 = "approach1", DEG_overlap = unc_a1.max),
df.deg.overlap data.frame(Group_1 = "uncorrected", Group_2 = "approach2", DEG_overlap = unc_a2.max), data.frame(Group_1 = "approach1",
Group_2 = "approach2", DEG_overlap = a1_a2.max))
# plot jaccard similarity profiles between batch corrected CDI methods
<- df.deg.overlap %>%
plt.deg.overlap ggplot(aes(x = paste0(Group_1, "_vs_", Group_2), y = DEG_overlap)) + geom_boxplot(fill = "grey") +
::geom_quasirandom() + theme_miko() + labs(title = "DEG Overlap", subtitle = "Jaccard Similarity of DEGs (per cluster)",
ggbeeswarmy = "Jaccard Similarity", x = "")
plt.deg.overlap