scATAC-pro generates results in plain texts, tables and .rds objects. This tutorial will access original downstream analysis results module by module, which was done by running command lines. We will use scATAC-pro outputs from 10x PBMC data as in the manuscript, except for the integrate module, where data from another study was used for illustration purpose. Download pbmc10x_scATACpro_output data to reproduce this tutorial.
library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
library(pheatmap)
::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE") devtools
'MACS2'
PEAK_CALLER = 'FILTER'
CELL_CALLER = 'pbmc10x_scATACpro_output/'
output_dir = paste0(output_dir, 'downstream_analysis/', PEAK_CALLER, '/',
down_dir ='/')
CELL_CALLER, ::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/dsAnalysis_utilities.R?raw=TRUE") devtools
paste0(output_dir, 'filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/matrix_doubletsRemoved.rds')
mtx_path = readRDS(mtx_path)
mtx =rownames(mtx)[1:10]
## [1] "chr1-100028739-100029280" "chr1-10003-10625"
## [3] "chr1-100034409-100034909" "chr1-100037608-100039203"
## [5] "chr1-100046036-100046536" "chr1-100046723-100047223"
## [7] "chr1-100047294-100047794" "chr1-1000611-1001111"
## [9] "chr1-100064771-100065271" "chr1-100065236-100065736"
colnames(mtx)[1:10]
## [1] "AAACGAAAGACACGGT" "AAACGAAAGAGGTGGG" "AAACGAAAGCACGTAG" "AAACGAAAGCGCCTAC"
## [5] "AAACGAAAGCTTTCCC" "AAACGAAAGGCGTCCT" "AAACGAAAGGCTTTAC" "AAACGAAAGTGATATG"
## [9] "AAACGAACAAACGACG" "AAACGAACAATTGCCA"
Clustering result was saved in .rds file as seurat_obj.rds and clustering label as one of the metadata
readRDS(paste0(down_dir, 'seurat_obj.rds'))
seurat_obj =
## cluster label was saved in metadata active_clusters
table(seurat_obj$active_clusters)
##
## 0 1 2 3 4 5 6 7
## 2254 1742 862 574 386 381 163 33
## plot umap
DimPlot(seurat_obj, reduction = 'umap')
The GO term enrichment result was saved in a .xlsx file.
library(xlsx)
'0'
group1 = 'rest'
group2 = paste0(down_dir, 'enrichedGO_differential_accessible_features_', group1, '_vs_', group2, '.xlsx')
go_file =
# get enriched terms for cluster0 for example and show top 20 terms
xlsx::read.xlsx(go_file, sheetName = 'cluster0')
go_res =
data.table(go_res)
go_res ='score' := -log10(p.adjust)]
go_res[, go_res[order(-score), ]
go_res = min(20, nrow(go_res))
ngo = go_res[1:ngo, ]
go_res = go_res[order(score), ]
go_res =$Description = factor(go_res$Description, levels = go_res$Description)
go_res
ggplot(go_res, aes(y = score, x = Description, fill = Count)) +
p_go <- geom_bar(width = 0.7, stat = 'identity') +
ggtitle("Enriched terms: cluster_0") + theme_classic() +
theme(legend.position = 'bottom', legend.direction = "horizontal") +
coord_flip() + scale_fill_continuous(name = "#genes", type = "viridis") +
xlab('') + ylab('-log10(p.adjust)')
p_go
The analysis was done through chromVAR R package.
library(chromVAR)
'hg38'
GENOME_NAME = seurat_obj@meta.data
metaData =
readRDS(paste0(down_dir, '/chromVar_obj.rds'))
chromVar.obj = paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.txt')
diff_tf_enrich_file =
fread( paste0(down_dir, '/differential_TF_motif_enriched_in_clusters.tsv'))
da.res =
## plot enriched TFs in heatmap
da.res$feature
sele.tfs = deviationScores(chromVar.obj)
zscores = zscores[sele.tfs, ]
sele.zscores =
## change rowname of zscores (tf name) to be readable
readable_tf(sele.zscores, GENOME_NAME)
sele.zscores <-
$active_clusters = as.character(metaData$active_clusters)
metaData
sele.zscores[!duplicated(sele.zscores), ]
sele.zscores =
data.table('barcode' = rownames(metaData),
bc_clusters ='cluster' = metaData$active_clusters)
plot_enrich_tf(sele.zscores, bc_clusters)
runDiffMotifEnrich(deviations(chromVar.obj), bc_clusters,
da.res =min_frac_per_cluster = 0.1,
mmax_cell_per_clust = 500,
topn = 5)
zscores[da.res$feature, ]
sele.zscores =
## change rowname of zscores (tf name) to be readable
readable_tf(sele.zscores, GENOME_NAME)
sele.zscores <-
plot_enrich_tf(sele.zscores, bc_clusters)
The differential bound TFs were saved in a table of plain text format
'0'
group1_fp = '1'
group2_fp = paste0(down_dir, '/differential_TF_footprint_',
footprint_stats.file ='_vs_', group2_fp, '.txt')
group1_fp, if(file.exists(footprint_stats.file)){
fread(footprint_stats.file)
footprint_out =if(length(unique(footprint_out$motif)) > 100){
'N' := .N, by = higher_in_cluster]
footprint_out[, unique(footprint_out[N > 10]$higher_in_cluster)
cls =if(length(cls) >= 1){
NULL
res0 =for(cl0 in cls){
footprint_out[higher_in_cluster == cl0]
tmp = tmp[order(P_values)][1:10, ]
tmp = rbind(res0, tmp)
res0 =
} rbind(footprint_out[N < 10], res0)
footprint_out =
}
}
reshape2::acast(motif ~ higher_in_cluster, data = footprint_out,
mm =value.var = "P_values")
-log10(mm)
mm =is.na(mm)] = 0
mm[ colnames(mm)
cn = sapply(cn, function(x) gsub('_higher', '', x))
cn.new =colnames(mm) = cn.new
> 3] = 3
mm[mm pheatmap(mm, cluster_cols = F, fontsize = 13, fontsize_row = 9,
color = viridis::viridis(100))
}
The cis-interactions were saved in a plain text file. Here we gonna read the interactions and plot the cis-loop within an interested region. You can change parameter Cicero_Plot_Region below to your interested genomic region:
library(cicero)
paste0(down_dir, '/cicero_interactions.txt')
cicero_conn.file = 'chr5:140610000-140640000'
Cicero_Plot_Region =if(file.exists(cicero_conn.file)){
fread(cicero_conn.file)
conns = data.frame(conns)
conns = tempfile()
temp <-if(grepl(GENOME_NAME, pattern = 'mm10', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/mus_musculus/Mus_musculus.GRCm38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'mm9', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg38', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz', temp)
}
if(grepl(GENOME_NAME, pattern = 'hg19', ignore.case = T)) {
download.file('ftp://ftp.ensembl.org/pub/release-67/gtf/homo_sapiens/Homo_sapiens.GRCh37.67.gtf.gz', temp)
}
rtracklayer::readGFF(temp)
gene_anno <-unlink(temp)
# rename some columns to match requirements
$chromosome <- paste0("chr", gene_anno$seqid)
gene_anno$gene <- gene_anno$gene_id
gene_anno$transcript <- gene_anno$transcript_id
gene_anno$symbol <- gene_anno$gene_name
gene_anno subset(gene_anno, select = c(chromosome, start, end, strand,
gene_anno =
transcript, gene, symbol)) gene_anno[complete.cases(gene_anno), ]
gene_anno =
unlist(strsplit(Cicero_Plot_Region, ':'))[1] ## chr5:140610000-140640000
chr0 = unlist(strsplit(Cicero_Plot_Region, ':'))[2]
region0 = as.integer(unlist(strsplit(region0, '-'))[1])
start0 = as.integer(unlist(strsplit(region0, '-'))[2])
end0 =
::plot_connections(conns, chr0, start0, end0,
cicerogene_model = gene_anno,
coaccess_cutoff = .3,
connection_width = 1,
collapseTranscripts = "longest",
viewpoint_alpha = 0)
}
readRDS(paste0(output_dir, 'integrated/seurat_obj_VFACS.rds'))
integrated_obj <-
DimPlot(integrated_obj, group.by = 'sample')
DimPlot(integrated_obj, group.by = 'active_clusters')