scATAC-pro generates results in plain texts, tables and .rds objects. This tutorial will access original downstream analysis results module by module, which was done by running command lines. We will use scATAC-pro outputs from 10x PBMC data as in the manuscript, except for the integrate module, where data from another study was used for illustration purpose. Download pbmc10x_scATACpro_output data to reproduce this tutorial.
library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
library(pheatmap)
devtools::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE")
mtx_path = paste0(output_dir, 'filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/matrix_doubletsRemoved.rds')
mtx = readRDS(mtx_path)
rownames(mtx)[1:10]
## [1] "chr1-100028739-100029280" "chr1-10003-10625"
## [3] "chr1-100034409-100034909" "chr1-100037608-100039203"
## [5] "chr1-100046036-100046536" "chr1-100046723-100047223"
## [7] "chr1-100047294-100047794" "chr1-1000611-1001111"
## [9] "chr1-100064771-100065271" "chr1-100065236-100065736"
colnames(mtx)[1:10]
## [1] "AAACGAAAGACACGGT" "AAACGAAAGAGGTGGG" "AAACGAAAGCACGTAG" "AAACGAAAGCGCCTAC"
## [5] "AAACGAAAGCTTTCCC" "AAACGAAAGGCGTCCT" "AAACGAAAGGCTTTAC" "AAACGAAAGTGATATG"
## [9] "AAACGAACAAACGACG" "AAACGAACAATTGCCA"
Clustering result was saved in .rds file as seurat_obj.rds and clustering label as one of the metadata
The GO term enrichment result was saved in a .xlsx file.
library(xlsx)
group1 = '0'
group2 = 'rest'
go_file = paste0(down_dir, 'enrichedGO_differential_accessible_features_', group1, '_vs_', group2, '.xlsx')
# get enriched terms for cluster0 for example and show top 20 terms
go_res = xlsx::read.xlsx(go_file, sheetName = 'cluster0')
go_res = data.table(go_res)
go_res[, 'score' := -log10(p.adjust)]
go_res = go_res[order(-score), ]
ngo = min(20, nrow(go_res))
go_res = go_res[1:ngo, ]
go_res = go_res[order(score), ]
go_res$Description = factor(go_res$Description, levels = go_res$Description)
p_go <- ggplot(go_res, aes(y = score, x = Description, fill = Count)) +
geom_bar(width = 0.7, stat = 'identity') +
ggtitle("Enriched terms: cluster_0") + theme_classic() +
theme(legend.position = 'bottom', legend.direction = "horizontal") +
coord_flip() + scale_fill_continuous(name = "#genes", type = "viridis") +
xlab('') + ylab('-log10(p.adjust)')
p_go
The analysis was done through chromVAR R package.
library(chromVAR)
GENOME_NAME = 'hg38'
metaData = seurat_obj@meta.data
chromVar.obj = readRDS(paste0(down_dir, '/chromVar_obj.rds'))
diff_tf_enrich_file = paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.txt')
da.res = fread( paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.tsv'))
## plot enriched TFs in heatmap
sele.tfs = da.res$feature
zscores = deviationScores(chromVar.obj)
sele.zscores = zscores[sele.tfs, ]
## change rowname of zscores (tf name) to be readable
sele.zscores <- readable_tf(sele.zscores, GENOME_NAME)
metaData$active_clusters = as.character(metaData$active_clusters)
sele.zscores = sele.zscores[!duplicated(sele.zscores), ]
bc_clusters = data.table('barcode' = rownames(metaData),
'cluster' = metaData$active_clusters)
plot_enrich_tf(sele.zscores, bc_clusters)
da.res = runDiffMotifEnrich(deviations(chromVar.obj), bc_clusters,
min_frac_per_cluster = 0.1,
mmax_cell_per_clust = 500,
topn = 5)
sele.zscores = zscores[da.res$feature, ]
## change rowname of zscores (tf name) to be readable
sele.zscores <- readable_tf(sele.zscores, GENOME_NAME)
plot_enrich_tf(sele.zscores, bc_clusters)
The differential bound TFs were saved in a table of plain text format
group1_fp = '0'
group2_fp = '1'
footprint_stats.file = paste0(down_dir, '/differential_TF_footprint_',
group1_fp, '_vs_', group2_fp, '.txt')
if(file.exists(footprint_stats.file)){
footprint_out = fread(footprint_stats.file)
if(length(unique(footprint_out$motif)) > 100){
footprint_out[, 'N' := .N, by = higher_in_cluster]
cls = unique(footprint_out[N > 10]$higher_in_cluster)
if(length(cls) >= 1){
res0 = NULL
for(cl0 in cls){
tmp = footprint_out[higher_in_cluster == cl0]
tmp = tmp[order(P_values)][1:10, ]
res0 = rbind(res0, tmp)
}
footprint_out = rbind(footprint_out[N < 10], res0)
}
}
mm = reshape2::acast(motif ~ higher_in_cluster, data = footprint_out,
value.var = "P_values")
mm = -log10(mm)
mm[is.na(mm)] = 0
cn = colnames(mm)
cn.new = sapply(cn, function(x) gsub('_higher', '', x))
colnames(mm) = cn.new
mm[mm > 3] = 3
pheatmap(mm, cluster_cols = F, fontsize = 13, fontsize_row = 9,
color = viridis::viridis(100))
}
The cis-interactions were saved in a plain text file. Here we gonna read the interactions and plot the cis-loop within an interested region. You can change parameter Cicero_Plot_Region below to your interested genomic region:
library(cicero)
cicero_conn.file = paste0(down_dir, '/cicero_interactions.txt')
Cicero_Plot_Region = 'chr5:140610000-140640000'
if(file.exists(cicero_conn.file)){
conns = fread(cicero_conn.file)
conns = data.frame(conns)
temp <- tempfile()
if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp)
}
gene_anno <- rtracklayer::readGFF(temp)
unlink(temp)
# rename some columns to match requirements
gene_anno$chromosome <- paste0("chr", gene_anno$seqid)
gene_anno$gene <- gene_anno$gene_id
gene_anno$transcript <- gene_anno$transcript_id
gene_anno$symbol <- gene_anno$gene_name
gene_anno = subset(gene_anno, select = c(chromosome, start, end, strand,
transcript, gene, symbol))
gene_anno = gene_anno[complete.cases(gene_anno), ]
chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
region0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
start0 = as.integer(unlist(strsplit(region0, '-'))[1])
end0 = as.integer(unlist(strsplit(region0, '-'))[2])
cicero::plot_connections(conns, chr0, start0, end0,
gene_model = gene_anno,
coaccess_cutoff = .3,
connection_width = 1,
collapseTranscripts = "longest",
viewpoint_alpha = 0)
}