scATAC-pro generates raw QC metrics in plain text format. This tutorial will tell users how to replot or extract those metrics. Download pbmc10x_scATACpro_output data to reproduce this tutorial.
library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
devtools::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE")
output_dir = 'pbmc10x_scATACpro_output/'
qc_dir = paste0(output_dir, 'summary/')
configure_user_file = 'pbmc10x_scATACpro_output/configure_user.txt'
## read configure_users.txt file
read_conf_file(configure_user_file)
## read mapping stats
mapping_qc_file = paste0(qc_dir, OUTPUT_PREFIX, '.MappingStats')
mapping_qc = fread(mapping_qc_file, header = F)
## read fragments file
fragments_file = paste0(qc_dir, OUTPUT_PREFIX, '.fragments.tsv.gz')
frags = fread(fragments_file)
names(frags) = c('chr', 'start', 'end', 'bc', 'ndup')
## read mapping stats for cell barcodes
cell_mapping_qc_file = paste0(qc_dir, 'cell_barcodes.MappingStats')
cell_mapping_qc = fread(cell_mapping_qc_file, header = F)
## read qc per barcode file
bc_stat_file = paste0(qc_dir, OUTPUT_PREFIX, '.', PEAK_CALLER, '.qc_per_barcode.txt')
bc_stat = fread(bc_stat_file)
## read cell barcodes
cell_barcodes = fread(paste0(output_dir, '/filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/barcodes_doubletsRemoved.txt'), header = F)$V1
lib_complx = nrow(frags)/sum(frags$ndup)
lib_complx = paste0(100 * round(lib_complx, 3), '%')
mapping_qc$frac = round(mapping_qc$V2/mapping_qc$V2[1], 3)
mapping_qc$frac = paste0(100*mapping_qc$frac, '%')
mapping_qc = rbind(mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))
kable(mapping_qc, col.names = NULL, format = 'html', caption = paste('Sample:', OUTPUT_PREFIX)) %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Total_Pairs | 234252877 | 100% |
Total_Pairs_Mapped | 232439675 | 99.2% |
Total_Uniq_Mapped | 218826440 | 93.4% |
Total_Mito_Mapped | 498721 | 0.2% |
Total_Dups | 98960085 | 42.2% |
Total_Pairs_MAPQ30 | 215148245 | 91.8% |
Total_Mito_MAPQ30 | 397411 | 0.2% |
Total_Dups_MAPQ30 | 94217407 | 40.2% |
Library Complexity (nonredudant fraction) | 63.6% |
qc_sele = bc_stat[bc %in% cell_barcodes, ]
qc_nonsele = bc_stat[!bc %in% cell_barcodes, ]
peak_file = paste0(output_dir, 'peaks/', PEAK_CALLER, '/', OUTPUT_PREFIX, '_features_BlacklistRemoved.bed')
peaks = fread(peak_file)
ncells = length(cell_barcodes)
mapq30.frac.in.cell = paste0(round(sum(frags[bc %in% cell_barcodes]$ndup)/sum(frags$ndup), 3) * 100, '%')
#frac.in.cell = paste0(round(cell_mapping_qc$V2[2]/as.integer(as.character(mapping_qc$V2[2])), 3) * 100, '%')
med.frag.per.cell = round(median(qc_sele$total_frags))
frac.uniq = paste0(round(nrow(frags[bc %in% cell_barcodes])/sum(frags[bc %in% cell_barcodes]$ndup), 3) * 100, '%')
frac.in.peak = sum(bc_stat$total_frags * bc_stat$frac_peak)/sum(bc_stat$total_frags)
frac.in.peak = paste0(round(frac.in.peak, 3)*100, '%')
frac.in.cell = sum(qc_sele$total_frags)/sum(bc_stat$total_frags)
frac.in.cell = paste0(round(frac.in.cell, 3)*100, '%')
cell.table = data.frame(c(PEAK_CALLER, nrow(peaks),
frac.in.peak,
CELL_CALLER, paste0(ncells),
frac.in.cell,
paste0(med.frag.per.cell),
frac.in.cell, mapq30.frac.in.cell))
rm(peaks)
rownames(cell.table) = c('Peak called by', 'Number of peaks',
'Percentage of unique fragments in peaks',
'Cell called by', 'Estimated # of cells',
'Percentage of unique fragments in cells',
'Median # of unique fragments per cell',
'Percentage of Mapped fragments in cells',
'Percentage of MAPQ30 fragments in cells')
kable(cell.table, row.names = T, col.names = NULL, format = 'html') %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Peak called by | MACS2 |
Number of peaks | 149864 |
Percentage of unique fragments in peaks | 37.4% |
Cell called by | FILTER |
Estimated # of cells | 6395 |
Percentage of unique fragments in cells | 58.4% |
Median # of unique fragments per cell | 10484 |
Percentage of Mapped fragments in cells | 58.4% |
Percentage of MAPQ30 fragments in cells | 62.9% |
if(CELL_MAP_QC){
lib_complx = frac.uniq
cell_mapping_qc$frac = round(cell_mapping_qc$V2/cell_mapping_qc$V2[1], 3)
cell_mapping_qc$frac = paste0(100*cell_mapping_qc$frac, '%')
cell_mapping_qc = rbind(cell_mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))
kable(cell_mapping_qc, col.names = NULL, format = 'html') %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
}
Total_Pairs | 141500136 | 100% |
Total_Pairs_Mapped | 140843153 | 99.5% |
Total_Uniq_Mapped | 134746877 | 95.2% |
Total_Mito_Mapped | 227559 | 0.2% |
Total_Dups | 67560585 | 47.7% |
Total_Pairs_MAPQ30 | 61916390 | 43.8% |
Total_Mito_MAPQ30 | 70978 | 0.1% |
Total_Dups_MAPQ30 | 30606046 | 21.6% |
Library Complexity (nonredudant fraction) | 57.5% |
A few plots and metrics for single cells
bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]
library(ggplot2)
library(grid)
nsub_frags = min(15000, nrow(bc_stat)) ## downsample for scatter plot
bc_stat_down = bc_stat[sort(sample(1:nrow(bc_stat), nsub_frags)), ]
g <- ggplot(data = bc_stat_down,
aes(x = total_frags, y = frac_peak, col = group)) +
geom_point(size = 0.5) + scale_x_continuous(trans='log10') + theme_bw() +
theme(legend.position = 'none',
legend.title=element_blank(),
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica")) +
xlab('Total #Unique Fragments') + ylab('Fraction in Peak')
text1 <- grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0,
gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0,
gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
g + annotation_custom(text1) + annotation_custom(text2)+
scale_color_manual(values = c('#1F78B4', '#B2DF8A'))
fragments_len_file = paste0(output_dir, 'summary/', OUTPUT_PREFIX, '.fragments.len.tsv.gz')
if(file.exists(fragments_len_file)) {
frag_lens = fread(fragments_len_file)
names(frag_lens) = 'isize'
}else{
lens = frags$end - frags$start
frag_lens = data.table('isize' = rep(lens, frags$ndup))
}
rm(frags)
frag_lens = frag_lens[sample(1:nrow(frag_lens), 1000000), ]
ggplot(data = frag_lens[isize < 800], aes(x = isize)) +
geom_density(fill = 'lightblue') + xlab('Insert Size') + ylab('Density') + theme_bw() + theme(legend.title=element_blank(),
legend.background = NULL,
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica"))
## read tss enrichement matrix
tss_escore_file = paste0(output_dir, '/signal/cell_barcodes.MAPQ30.aggregated.mtx.gz')
if(!file.exists(tss_escore_file)) tss_escore_file = paste0(output_dir, '/signal/', OUTPUT_PREFIX, '.aggregated.mtx.gz')
set.cols = brewer.pal(n=5, name = 'Dark2')
tss.mat = fread(tss_escore_file)
tss.mat = tss.mat[, -c(1:6)]
tss.mat[is.na(tss.mat)] = 0
tss.escore = colSums(tss.mat)
ma <- function(x, n = 10){stats::filter(x, rep(1 / n, n), sides = 2)}
tss.escore = ma(tss.escore)
tss.escore = tss.escore[14:213]
df = data.table(index = 10*(-100:99), escore = tss.escore/tss.escore[1])
ggplot(data = df, aes(x = index, y = escore)) + geom_line(size = 1, col = set.cols[1]) + theme_bw() +
xlab('Distance to TSS (bp)') + ylab('TSS enrichment score') + theme(legend.title=element_blank(),
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica"))
bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]
p <- ggplot(data = bc_stat, aes(x = total_frags, fill = group)) +
geom_density() + scale_x_continuous(trans = 'log10') + theme_bw() +
theme(legend.position='none', legend.title=element_blank(),
axis.title = element_text(size = 18, family = "Helvetica"),
axis.text = element_text(size = 15, family = "Helvetica")) +
xlab('Total #Unique Fragments') + ylab('Density')
text1 <- grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0,
gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0,
gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
p + annotation_custom(text1) + annotation_custom(text2) +
scale_fill_manual(values = c('#1F78B4', '#B2DF8A'))
qc_sele_df = data.table(frac = c(qc_sele$frac_peak, qc_sele$frac_tss, qc_sele$frac_promoter, qc_sele$frac_enh, qc_sele$frac_mito), 'type' = rep(c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'), each = nrow(qc_sele)))
qc_sele_df$type = factor(qc_sele_df$type, levels = c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'))
ggplot(data = qc_sele_df, aes(y = frac, x = type, fill = type)) + ylab('Fraction') + theme_bw() +
geom_boxplot(outlier.size = 0.01, show.legend = FALSE) +
scale_fill_manual(values = brewer.pal(5, 'Paired')) +
theme(legend.position = 'none',
axis.text = element_text(size = 18, family = "Helvetica"),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 18, family = "Helvetica")) + xlab('')
frac_peak = sum(qc_sele$total_frags * qc_sele$frac_peak)/sum(qc_sele$total_frags)
frac_mito = sum(qc_sele$total_frags * qc_sele$frac_mito)/sum(qc_sele$total_frags)
frac_promoter = sum(qc_sele$total_frags * qc_sele$frac_promoter)/sum(qc_sele$total_frags)
frac_enh = sum(qc_sele$total_frags * qc_sele$frac_enhancer)/sum(qc_sele$total_frags)
frac_tss = sum(qc_sele$total_frags * qc_sele$frac_tss)/sum(qc_sele$total_frags)
frac_peak = median(qc_sele$frac_peak)
frac_mito = median(qc_sele$frac_mito)
frac_promoter = median(qc_sele$frac_promoter)
frac_enh = median(qc_sele$frac_enhancer)
frac_tss = median(qc_sele$frac_tss)
fracs = data.frame(c(frac_peak, frac_promoter, frac_enh, frac_tss, frac_mito))
rownames(fracs) = c('Median % fragments overlapping peaks',
'Median % fragments overlapping promoters',
'Median % fragments overlapping Enhancers(ENCODE)',
'Median % fragments overlapping TSS (+/-1Kb)',
'Median % fragments in mitochondrial genome')
colnames(fracs) = 'pr'
fracs$pr = round(fracs$pr, 3)
fracs$pr = paste0(100*fracs$pr, '%')
kable(fracs, row.names = T, col.names = NULL) %>%
kable_styling(full_width = F, position = 'left', font_size = 15)
Median % fragments overlapping peaks | 49.5% |
Median % fragments overlapping promoters | 31.3% |
Median % fragments overlapping Enhancers(ENCODE) | 15.1% |
Median % fragments overlapping TSS (+/-1Kb) | 32.8% |
Median % fragments in mitochondrial genome | 0% |