scATAC-pro generates results in plain texts, tables and .rds objects. This tutorial shows how some modules can be re-run using different options or parameters. We will use 10x PBMC 10x data as in the manuscript, except for the integrate module, where data from another study was used for illustration purpose. Download pbmc10x_scATACpro_output data to reproduce this tutorial.
seurat_obj = readRDS(paste0(down_dir, 'seurat_obj.rds'))
seurat_obj <- RunPCA(seurat_obj, npcs = 20)
seurat_obj <- FindNeighbors(seurat_obj, reduction = 'pca', dims = 1:20)
seurat_obj <- FindClusters(seurat_obj, resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6395
## Number of edges: 271244
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9153
## Number of communities: 12
## Elapsed time: 0 seconds
seurat_obj$active_clusters = seurat_obj$seurat_clusters
DimPlot(seurat_obj)
resl <- queryResolution4Seurat(seurat_obj, k = 8, reduction = 'pca',
npc = 20, min_resl = 0.1, max_resl = 1,
max_iter = 15)
seurat_obj <- FindClusters(seurat_obj, resolution = resl)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6395
## Number of edges: 271244
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9601
## Number of communities: 8
## Elapsed time: 1 seconds
seurat_obj$active_clusters = seurat_obj$seurat_clusters
DimPlot(seurat_obj)
## load original cell-peak matrix
mtx = readRDS(paste0(output_dir, 'filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/matrix_doubletsRemoved.rds'))
mtx <- filterMat(mtx, minFrac_in_cell = 0.01, min_depth = 1000, max_depth = 50000)
## annotate rownames (peaks) with gene
tss.ann <- fread('https://raw.githubusercontent.com/wbaopaul/scATAC-pro/master/annotation/hg38_tss.bed')
names(tss.ann)[c(1:4, 7)] = c('chr', 'start', 'end', 'gene_name', 'gene_type')
tss.ann = subset(tss.ann, gene_type %in% c('protein_coding', 'miRNA', 'lincRNA'))
mtx.ann = assignGene2Peak(mtx, tss.ann) ## rename the rowname mtx with annotated peaks
rownames(mtx.ann)[1:10]
## [1] "chr1-100028739-100029280,MFSD14A"
## [2] "chr1-100034409-100034909,MFSD14A"
## [3] "chr1-100037608-100039203,MFSD14A-Tss"
## [4] "chr1-100046723-100047223,MFSD14A"
## [5] "chr1-1000611-1001111,HES4-Tss,ISG15-Tss"
## [6] "chr1-100132504-100133468,SASS6-Tss,TRMT13-Tss"
## [7] "chr1-1001736-1002236,ISG15-Tss"
## [8] "chr1-100249179-100250283,DBT-Tss"
## [9] "chr1-100265696-100266796,RTCA-Tss"
## [10] "chr1-10029041-10029541,UBE4B"
## create a new seurat obj for visualize and other analysis
seurat_obj.new <- runSeurat_Atac(mtx.ann, npc = 20, norm_by = 'tf-idf',
top_variable_features = 5000, reg.var = 'nCount_ATAC')
rm(mtx.ann)
seurat_obj.new <- RunUMAP(seurat_obj.new, dims = 1:20)
This method will take a while.
cisTopic.obj <- run_cisTopic(mtx, nCores = 5, topic = c(10, 20, 30, 50, 100),
frac_in_cell = 0.05)
## select the best model and lda as a new dimension reduction the seurat obj
sele.model <- cisTopic::selectModel(cisTopic.obj, select = nREDUCTION,
keepBinaryMatrix = F, keepModels = F)
cell_topic <- t(modelMatSelection(sele.model, 'cell', 'Probability'))
cl.labels = generalCluster(cell_topic, method = 'hclust', k = 8)
seurat_obj.new[['lda']] <- CreateDimReducObject(embeddings = cell_topic,
key = '_Topic', assay = DefaultAssay(seurat_obj.new))
## You can run umap or clustering on lda by specifying reduction='lda'
seurat_obj.new <- RunUMAP(seurat_obj.new, dims = 1:ncol(cell_topic), reduction = 'lda')
seurat_obj.new <- FindNeighbors(seurat_obj.new, reduction = 'lda', dims = 1:ncol(cell_topic))
seurat_obj.new <- FindClusters(seurat_obj.new, resolution = 0.4)
seurat_obj.new$active_clusters = seurat_obj.new$seurat_cluster
DimPlot(seurat_obj.new)
library(chromVAR)
library(chromVARmotifs)
## if you have already run motif analysis, chromvar obj was saved and
chromVar.obj <- readRDS(paste0(down_dir, '/chromVar_obj.rds'))
## otherwise
#chromVar.obj <- run_chromVAR(mtx, genomeName = 'BSgenome.Hsapiens.UCSC.hg38', ncore = 4)
zscore = chromVar.obj@assays@data$z
zscore = zscore[, colnames(zscore) %in% colnames(mtx)]
pca_coords = doDimReduction4mat(zscore, max_pc = 20)[[1]]
cl.labels = cutree(hclust(dist(pca_coords)), k = 8)
This is the original LSI method (the first PC was discarded), and you can specify different number of PCs and filter peaks
Compare TF enrichment among clusters. You can rerun chromVar or using original chromVar object (saved in .rds) and using updated clustering results.
library(chromVAR)
library(chromVARmotifs)
## if you have already run motif analysis, chromvar obj was saved and
chromVar.obj <- readRDS(paste0(down_dir, '/chromVar_obj.rds'))
## otherwise to re-run chromvar
#chromVar.obj <- run_chromVAR(mtx, genomeName = 'BSgenome.Hsapiens.UCSC.hg38', ncore = 4)
zscores = chromVar.obj@assays@data$z
dscores = chromVar.obj@assays@data$deviations
dscores = dscores[, colnames(dscores) %in% colnames(mtx)]
zscores = zscores[, colnames(zscores) %in% colnames(mtx)]
bc_clusters = data.table('barcode' = colnames(mtx),
'cluster' = seurat_obj.new$active_clusters)
tf.diff <- runDiffMotifEnrich(dscores, bc_clusters, topn = 10,
min_frac_per_cluster = 0.1, fdr = 0.01,
max_cell_per_clust = 300)
sele.zscores = zscores[tf.diff$feature, ]
## change rowname of zscores (tf name) to be readable
sele.zscores <- readable_tf(sele.zscores, 'hg38')
sele.zscores = sele.zscores[!duplicated(sele.zscores), ]
plot_enrich_tf(sele.zscores, bc_clusters)
We perform the differential accessibility (DA) analysis using FindMarkers function in Seurat V3. We can compare any pair of two groups or one-vs-rest comparison:
## comparing cluster1 vs cluster2 for example:
Idents(seurat_obj.new) = as.character(seurat_obj.new$active_clusters)
diff.peaks = FindMarkers(seurat_obj.new, slot = 'data',
ident.1 = c('1'), ident.2 = c('2'),
test.use = 'wilcox', verbose = F,
max.cells.per.ident = 500,
logfc.threshold = 0.0, min.pct = 0.1,
only.pos = F, latent.vars = NULL)
diff.peaks = data.table(diff.peaks, keep.rownames = T)
diff.peaks = subset(diff.peaks, p_val_adj < 0.01)
names(diff.peaks)[1] = 'peak'
names(diff.peaks)[grepl(names(diff.peaks), pattern = 'avg_log')] = 'avg_log2FC'
group1.high.peaks = diff.peaks[avg_log2FC > 0]$peak
group2.high.peaks = diff.peaks[avg_log2FC < 0]$peak
Idents(seurat_obj.new) = as.character(seurat_obj.new$active_clusters)
diff.peaks = FindAllMarkers(seurat_obj.new, slot = 'data',
test.use = 'wilcox', only.pos = T,
max.cells.per.ident = 500,
logfc.threshold = 0.0, min.pct = 0.1,
latent.vars = NULL)
diff.peaks = data.table(diff.peaks, keep.rownames = T)
diff.peaks = subset(diff.peaks, p_val_adj < 0.01)
names(diff.peaks)[1] = 'peak'
group1.high.peaks = diff.peaks[cluster == 1]$peak
For those peaks with differential accessibility, we can conduct GO enrichment analysis for each cluster taking advantage of annotated peaks. We use all genes associated with peaks as background genes. We conduct GO enrichment for cluster 1 as an example:
library(clusterProfiler)
## prepare genes for each cluster
group1.genes <- lapply(group1.high.peaks, function(x) unlist(strsplit(x, ','))[-1])
group1.genes = do.call('c', group1.genes)
group1.genes = unique(sapply(group1.genes, function(x) gsub('-Tss', '', x)))
## set background genes
bg.genes <- lapply(rownames(seurat_obj.new), function(x) unlist(strsplit(x, ','))[-1])
bg.genes = do.call('c', bg.genes)
bg.genes = unique(sapply(bg.genes, function(x) gsub('-Tss', '', x)))
## perform GO analysis per cluster
group1.go <- do_GO(group1.genes, bg_genes = bg.genes,
type = 'BP', organism = 'hsa')@result
group1.go <- data.table(group1.go)
group1.go = group1.go[qvalue < 0.05 & Count > 10, ]
We plot top 15 enriched terms for cluster 1, for example:
group1.go[, 'score' := -log10(p.adjust)]
group1.go = group1.go[order(-score), ]
ngo = min(15, nrow(group1.go))
group1.go = group1.go[1:ngo, ]
group1.go = group1.go[order(score), ]
group1.go$Description = factor(group1.go$Description, levels = group1.go$Description)
ggplot(group1.go, aes(y = score, x = Description, fill = Count)) +
geom_bar(width = 0.7, stat = 'identity') +
ggtitle("Enriched terms: cluster_1") + theme_classic() +
theme(legend.position = 'bottom', legend.direction = "horizontal") +
coord_flip() + scale_fill_continuous(name = "#genes", type = "viridis") +
xlab('') + ylab('-log10(p.adjust)')
For illustration purpose, I split the matrix into two sub-matrices and try to integrate them. The default integration method is VFACS (Variable Features Across ClusterS), which first pool the data and perform an initial clustering, followed by a reselecting variable features across the initial clusters; PCA, UMAP and clustering was conducted thereafter. Other integration method like ‘pool’, ‘seurat’ and ‘harmony’ can be specified through integrate_by.
#prepare a list of matrix
mtx_list = list(mtx[, 1:3000], mtx[, -(1:3000)])
seurat.integrate <- run_integration(mtx_list, integrate_by = 'VFACS',
top_variable_features = 5000,
norm_by = 'tf-idf', nREDUCTION = 30,
minFrac_in_cell = 0.01, min_depth = 1000,
max_depth = 50000, reg.var = 'nCount_ATAC',
resolution = 0.6)
DimPlot(seurat.integrate, group.by = 'sample')
Label trasfer (cell annotation) from cell annotation of scRNA-seq data. First construct a gene-by-cell activity matrix from scATAC-seq, then use FindTransferAnchors and TransferData function from Seurat R package to predicted cell type annotation from the cell annotaiton in scRNA-seq data. Here we prepared an annotated seurat object (seurat_rna4labelTransfer.rds) for 10x scRNA-seq for PBMC data.
# load your annotated rna seurat object (Cell_Type as the metadata for cell annotation)
seurat.rna = readRDS('pbmc10x_scATACpro_output/seurat_rna4labelTransfer.rds')
seurat.atac = readRDS(paste0(down_dir, 'seurat_obj.rds'))
# load your gene annotation
library(EnsDb.Hsapiens.v86) ## for hg38 genome
ens.ann <- EnsDb.Hsapiens.v86
gene_ann <- ensembldb::genes(ens.ann)
gene_ann <- keepStandardChromosomes(gene_ann, pruning.mode = 'coarse')
gene_ann = data.frame(gene_ann)
gene_ann = subset(gene_ann, gene_biotype %in% c('protein_coding', 'miRNA', 'lincRNA'),
select = c('seqnames', 'start', 'end',
'strand', 'gene_biotype', 'gene_name'))
names(gene_ann)[1:3] = c('chr', 'gene_start', 'gene_end')
gene_ann = data.table(gene_ann)
gene_ann = gene_ann[!duplicated(gene_name)]
gene_ann$chr = paste0('chr', gene_ann$chr)
gene_ann = subset(gene_ann, select = c('chr', 'gene_start', 'gene_end',
'strand', 'gene_name'))
## do the label transfer:
seurat.atac <- labelTransfer_R(seurat.atac, seurat.rna,
gene_ann, rna_ann_var = 'Cell_Type',
include_genebody = T)
# plot
p1 <- DimPlot(seurat.atac, group.by = "Predicted_Cell_Type",
label = TRUE, repel = TRUE) + ggtitle("scATAC-seq")
p2 <- DimPlot(seurat.rna, group.by = "Cell_Type",
label = TRUE, repel = TRUE) + ggtitle("scRNA-seq")
gridExtra::grid.arrange(p1, p2, nrow = 1)
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux 8.2 (Ootpa)
##
## Matrix products: default
## BLAS: /cm/shared/apps_chop/R/4.0.5/lib64/R/lib/libRblas.so
## LAPACK: /cm/shared/apps_chop/R/4.0.5/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 compiler stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.14.0
## [3] AnnotationFilter_1.14.0 GenomicFeatures_1.42.3
## [5] org.Hs.eg.db_3.12.0 AnnotationDbi_1.52.0
## [7] clusterProfiler_3.18.1 chromVARmotifs_0.2.0
## [9] chromVAR_1.12.0 mclust_5.4.7
## [11] edgeR_3.32.1 limma_3.46.0
## [13] BiocParallel_1.24.1 SummarizedExperiment_1.20.0
## [15] Biobase_2.50.0 GenomicRanges_1.42.0
## [17] GenomeInfoDb_1.26.7 IRanges_2.24.1
## [19] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [21] MatrixGenerics_1.2.1 Matrix_1.3-4
## [23] matrixStats_0.58.0 readr_1.4.0
## [25] viridis_0.6.0 viridisLite_0.4.0
## [27] RColorBrewer_1.1-2 SeuratObject_4.0.2
## [29] Seurat_4.0.4 ggplot2_3.3.3
## [31] magrittr_2.0.1 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 rtracklayer_1.50.0
## [3] scattermore_0.7 R.methodsS3_1.8.1
## [5] tidyr_1.1.3 bit64_4.0.5
## [7] knitr_1.32 irlba_2.3.3
## [9] DelayedArray_0.16.3 R.utils_2.10.1
## [11] rpart_4.1-15 KEGGREST_1.30.1
## [13] TFBSTools_1.28.0 RCurl_1.98-1.3
## [15] generics_0.1.0 callr_3.6.0
## [17] cowplot_1.1.1 usethis_2.0.1
## [19] RSQLite_2.2.6 shadowtext_0.0.7
## [21] RANN_2.6.1 future_1.22.1
## [23] bit_4.0.4 enrichplot_1.10.2
## [25] xml2_1.3.2 spatstat.data_2.1-0
## [27] httpuv_1.6.3 assertthat_0.2.1
## [29] DirichletMultinomial_1.32.0 xfun_0.22
## [31] hms_1.0.0 jquerylib_0.1.3
## [33] evaluate_0.14 promises_1.2.0.1
## [35] progress_1.2.2 fansi_0.4.2
## [37] dbplyr_2.1.1 caTools_1.18.2
## [39] igraph_1.2.6 DBI_1.1.1
## [41] htmlwidgets_1.5.4 spatstat.geom_2.0-1
## [43] purrr_0.3.4 ellipsis_0.3.2
## [45] RSpectra_0.16-0 dplyr_1.0.5
## [47] annotate_1.68.0 biomaRt_2.46.3
## [49] deldir_0.2-10 vctrs_0.3.7
## [51] remotes_2.3.0 ROCR_1.0-11
## [53] abind_1.4-5 cachem_1.0.4
## [55] withr_2.4.1 ggforce_0.3.3
## [57] BSgenome_1.58.0 sctransform_0.3.2
## [59] GenomicAlignments_1.26.0 prettyunits_1.1.1
## [61] goftest_1.2-2 cluster_2.1.1
## [63] DOSE_3.16.0 lazyeval_0.2.2
## [65] seqLogo_1.56.0 crayon_1.4.1
## [67] pkgconfig_2.0.3 labeling_0.4.2
## [69] tweenr_1.0.2 ProtGenerics_1.22.0
## [71] nlme_3.1-152 pkgload_1.2.1
## [73] devtools_2.4.0 rlang_0.4.12
## [75] globals_0.14.0 lifecycle_1.0.1
## [77] miniUI_0.1.1.1 downloader_0.4
## [79] BiocFileCache_1.14.0 rprojroot_2.0.2
## [81] polyclip_1.10-0 lmtest_0.9-38
## [83] zoo_1.8-9 ggridges_0.5.3
## [85] processx_3.5.1 pheatmap_1.0.12
## [87] png_0.1-7 bitops_1.0-6
## [89] R.oo_1.24.0 KernSmooth_2.23-18
## [91] Biostrings_2.58.0 blob_1.2.1
## [93] stringr_1.4.0 qvalue_2.22.0
## [95] parallelly_1.28.1 CNEr_1.26.0
## [97] scales_1.1.1 memoise_2.0.0
## [99] plyr_1.8.6 ica_1.0-2
## [101] zlibbioc_1.36.0 scatterpie_0.1.5
## [103] fitdistrplus_1.1-3 Rsamtools_2.6.0
## [105] cli_3.0.1 XVector_0.30.0
## [107] listenv_0.8.0 patchwork_1.1.1
## [109] pbapply_1.4-3 ps_1.6.0
## [111] MASS_7.3-53.1 mgcv_1.8-34
## [113] tidyselect_1.1.0 stringi_1.7.5
## [115] highr_0.8 yaml_2.2.1
## [117] GOSemSim_2.16.1 askpass_1.1
## [119] locfit_1.5-9.4 ggrepel_0.9.1
## [121] grid_4.0.5 sass_0.3.1
## [123] fastmatch_1.1-0 tools_4.0.5
## [125] future.apply_1.7.0 rstudioapi_0.13
## [127] TFMPvalue_0.0.8 gridExtra_2.3
## [129] farver_2.1.0 Rtsne_0.15
## [131] ggraph_2.0.5 BiocManager_1.30.15
## [133] digest_0.6.28 rvcheck_0.1.8
## [135] shiny_1.6.0 pracma_2.3.3
## [137] Rcpp_1.0.7 later_1.3.0
## [139] RcppAnnoy_0.0.18 httr_1.4.2
## [141] colorspace_2.0-0 XML_3.99-0.6
## [143] fs_1.5.0 tensor_1.5
## [145] reticulate_1.18 splines_4.0.5
## [147] uwot_0.1.10 spatstat.utils_2.1-0
## [149] graphlayouts_0.7.1 plotly_4.9.3
## [151] sessioninfo_1.1.1 xtable_1.8-4
## [153] jsonlite_1.7.2 poweRlaw_0.70.6
## [155] tidygraph_1.2.0 testthat_3.0.2
## [157] R6_2.5.1 pillar_1.6.0
## [159] htmltools_0.5.2 mime_0.12
## [161] glue_1.4.2 fastmap_1.1.0
## [163] DT_0.18 codetools_0.2-18
## [165] fgsea_1.16.0 pkgbuild_1.2.0
## [167] utf8_1.2.1 lattice_0.20-41
## [169] bslib_0.2.4 spatstat.sparse_2.0-0
## [171] tibble_3.1.0 curl_4.3.2
## [173] leiden_0.3.7 gtools_3.8.2
## [175] openssl_1.4.3 GO.db_3.12.1
## [177] survival_3.2-10 rmarkdown_2.7
## [179] desc_1.3.0 munsell_0.5.0
## [181] DO.db_2.9 GenomeInfoDbData_1.2.4
## [183] reshape2_1.4.4 gtable_0.3.0
## [185] spatstat.core_2.0-0