scATAC-pro generates raw QC metrics in plain text format. This tutorial will tell users how to replot or extract those metrics. Download pbmc10x_scATACpro_output data to reproduce this tutorial.
library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE") devtools
'pbmc10x_scATACpro_output/'
output_dir = paste0(output_dir, 'summary/')
qc_dir = 'pbmc10x_scATACpro_output/configure_user.txt'
configure_user_file =
## read configure_users.txt file
read_conf_file(configure_user_file)
## read mapping stats
paste0(qc_dir, OUTPUT_PREFIX, '.MappingStats')
mapping_qc_file = fread(mapping_qc_file, header = F)
mapping_qc =
## read fragments file
paste0(qc_dir, OUTPUT_PREFIX, '.fragments.tsv.gz')
fragments_file =
fread(fragments_file)
frags =names(frags) = c('chr', 'start', 'end', 'bc', 'ndup')
## read mapping stats for cell barcodes
paste0(qc_dir, 'cell_barcodes.MappingStats')
cell_mapping_qc_file = fread(cell_mapping_qc_file, header = F)
cell_mapping_qc =
## read qc per barcode file
paste0(qc_dir, OUTPUT_PREFIX, '.', PEAK_CALLER, '.qc_per_barcode.txt')
bc_stat_file = fread(bc_stat_file)
bc_stat =
## read cell barcodes
fread(paste0(output_dir, '/filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/barcodes_doubletsRemoved.txt'), header = F)$V1 cell_barcodes =
nrow(frags)/sum(frags$ndup)
lib_complx = paste0(100 * round(lib_complx, 3), '%')
lib_complx =
$frac = round(mapping_qc$V2/mapping_qc$V2[1], 3)
mapping_qc$frac = paste0(100*mapping_qc$frac, '%')
mapping_qc
rbind(mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))
mapping_qc =
kable(mapping_qc, col.names = NULL, format = 'html', caption = paste('Sample:', OUTPUT_PREFIX)) %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Total_Pairs | 234252877 | 100% |
Total_Pairs_Mapped | 232439675 | 99.2% |
Total_Uniq_Mapped | 218826440 | 93.4% |
Total_Mito_Mapped | 498721 | 0.2% |
Total_Dups | 98960085 | 42.2% |
Total_Pairs_MAPQ30 | 215148245 | 91.8% |
Total_Mito_MAPQ30 | 397411 | 0.2% |
Total_Dups_MAPQ30 | 94217407 | 40.2% |
Library Complexity (nonredudant fraction) | 63.6% |
bc_stat[bc %in% cell_barcodes, ]
qc_sele = bc_stat[!bc %in% cell_barcodes, ]
qc_nonsele =
paste0(output_dir, 'peaks/', PEAK_CALLER, '/', OUTPUT_PREFIX, '_features_BlacklistRemoved.bed')
peak_file = fread(peak_file)
peaks = length(cell_barcodes)
ncells = paste0(round(sum(frags[bc %in% cell_barcodes]$ndup)/sum(frags$ndup), 3) * 100, '%')
mapq30.frac.in.cell =#frac.in.cell = paste0(round(cell_mapping_qc$V2[2]/as.integer(as.character(mapping_qc$V2[2])), 3) * 100, '%')
round(median(qc_sele$total_frags))
med.frag.per.cell = paste0(round(nrow(frags[bc %in% cell_barcodes])/sum(frags[bc %in% cell_barcodes]$ndup), 3) * 100, '%')
frac.uniq = sum(bc_stat$total_frags * bc_stat$frac_peak)/sum(bc_stat$total_frags)
frac.in.peak = paste0(round(frac.in.peak, 3)*100, '%')
frac.in.peak = sum(qc_sele$total_frags)/sum(bc_stat$total_frags)
frac.in.cell = paste0(round(frac.in.cell, 3)*100, '%')
frac.in.cell = data.frame(c(PEAK_CALLER, nrow(peaks),
cell.table =
frac.in.peak, paste0(ncells),
CELL_CALLER,
frac.in.cell,paste0(med.frag.per.cell),
frac.in.cell, mapq30.frac.in.cell)) rm(peaks)
rownames(cell.table) = c('Peak called by', 'Number of peaks',
'Percentage of unique fragments in peaks',
'Cell called by', 'Estimated # of cells',
'Percentage of unique fragments in cells',
'Median # of unique fragments per cell',
'Percentage of Mapped fragments in cells',
'Percentage of MAPQ30 fragments in cells')
kable(cell.table, row.names = T, col.names = NULL, format = 'html') %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Peak called by | MACS2 |
Number of peaks | 149864 |
Percentage of unique fragments in peaks | 37.4% |
Cell called by | FILTER |
Estimated # of cells | 6395 |
Percentage of unique fragments in cells | 58.4% |
Median # of unique fragments per cell | 10484 |
Percentage of Mapped fragments in cells | 58.4% |
Percentage of MAPQ30 fragments in cells | 62.9% |
if(CELL_MAP_QC){
frac.uniq
lib_complx =
$frac = round(cell_mapping_qc$V2/cell_mapping_qc$V2[1], 3)
cell_mapping_qc$frac = paste0(100*cell_mapping_qc$frac, '%')
cell_mapping_qc
rbind(cell_mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))
cell_mapping_qc =
kable(cell_mapping_qc, col.names = NULL, format = 'html') %>%
kable_styling("striped", full_width = F, position = 'left', font_size = 15)
}
Total_Pairs | 141500136 | 100% |
Total_Pairs_Mapped | 140843153 | 99.5% |
Total_Uniq_Mapped | 134746877 | 95.2% |
Total_Mito_Mapped | 227559 | 0.2% |
Total_Dups | 67560585 | 47.7% |
Total_Pairs_MAPQ30 | 61916390 | 43.8% |
Total_Mito_MAPQ30 | 70978 | 0.1% |
Total_Dups_MAPQ30 | 30606046 | 21.6% |
Library Complexity (nonredudant fraction) | 57.5% |
A few plots and metrics for single cells
'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]
bc_stat[,
library(ggplot2)
library(grid)
min(15000, nrow(bc_stat)) ## downsample for scatter plot
nsub_frags = bc_stat[sort(sample(1:nrow(bc_stat), nsub_frags)), ]
bc_stat_down = ggplot(data = bc_stat_down,
g <-aes(x = total_frags, y = frac_peak, col = group)) +
geom_point(size = 0.5) + scale_x_continuous(trans='log10') + theme_bw() +
theme(legend.position = 'none',
legend.title=element_blank(),
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica")) +
xlab('Total #Unique Fragments') + ylab('Fraction in Peak')
grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0,
text1 <-gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0,
text2 <-gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
+ annotation_custom(text1) + annotation_custom(text2)+
g scale_color_manual(values = c('#1F78B4', '#B2DF8A'))
paste0(output_dir, 'summary/', OUTPUT_PREFIX, '.fragments.len.tsv.gz')
fragments_len_file =if(file.exists(fragments_len_file)) {
fread(fragments_len_file)
frag_lens =names(frag_lens) = 'isize'
else{
} frags$end - frags$start
lens = data.table('isize' = rep(lens, frags$ndup))
frag_lens =
}rm(frags)
frag_lens[sample(1:nrow(frag_lens), 1000000), ]
frag_lens =ggplot(data = frag_lens[isize < 800], aes(x = isize)) +
geom_density(fill = 'lightblue') + xlab('Insert Size') + ylab('Density') + theme_bw() + theme(legend.title=element_blank(),
legend.background = NULL,
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica"))
## read tss enrichement matrix
paste0(output_dir, '/signal/cell_barcodes.MAPQ30.aggregated.mtx.gz')
tss_escore_file =if(!file.exists(tss_escore_file)) tss_escore_file = paste0(output_dir, '/signal/', OUTPUT_PREFIX, '.aggregated.mtx.gz')
brewer.pal(n=5, name = 'Dark2')
set.cols = fread(tss_escore_file)
tss.mat = tss.mat[, -c(1:6)]
tss.mat =is.na(tss.mat)] = 0
tss.mat[ colSums(tss.mat)
tss.escore = function(x, n = 10){stats::filter(x, rep(1 / n, n), sides = 2)}
ma <- ma(tss.escore)
tss.escore = tss.escore[14:213]
tss.escore = data.table(index = 10*(-100:99), escore = tss.escore/tss.escore[1])
df =
ggplot(data = df, aes(x = index, y = escore)) + geom_line(size = 1, col = set.cols[1]) + theme_bw() +
xlab('Distance to TSS (bp)') + ylab('TSS enrichment score') + theme(legend.title=element_blank(),
axis.text = element_text(size = 15, family = "Helvetica"),
axis.title = element_text(size = 18, family = "Helvetica"))
'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]
bc_stat[,
ggplot(data = bc_stat, aes(x = total_frags, fill = group)) +
p <- geom_density() + scale_x_continuous(trans = 'log10') + theme_bw() +
theme(legend.position='none', legend.title=element_blank(),
axis.title = element_text(size = 18, family = "Helvetica"),
axis.text = element_text(size = 15, family = "Helvetica")) +
xlab('Total #Unique Fragments') + ylab('Density')
grobTree(textGrob("Cell", x=0.8, y=0.93, hjust=0,
text1 <-gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
grobTree(textGrob("Non-cell", x=0.8, y=0.83, hjust=0,
text2 <-gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
+ annotation_custom(text1) + annotation_custom(text2) +
p scale_fill_manual(values = c('#1F78B4', '#B2DF8A'))
data.table(frac = c(qc_sele$frac_peak, qc_sele$frac_tss, qc_sele$frac_promoter, qc_sele$frac_enh, qc_sele$frac_mito), 'type' = rep(c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'), each = nrow(qc_sele)))
qc_sele_df =
$type = factor(qc_sele_df$type, levels = c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'))
qc_sele_df
ggplot(data = qc_sele_df, aes(y = frac, x = type, fill = type)) + ylab('Fraction') + theme_bw() +
geom_boxplot(outlier.size = 0.01, show.legend = FALSE) +
scale_fill_manual(values = brewer.pal(5, 'Paired')) +
theme(legend.position = 'none',
axis.text = element_text(size = 18, family = "Helvetica"),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 18, family = "Helvetica")) + xlab('')
sum(qc_sele$total_frags * qc_sele$frac_peak)/sum(qc_sele$total_frags)
frac_peak = sum(qc_sele$total_frags * qc_sele$frac_mito)/sum(qc_sele$total_frags)
frac_mito = sum(qc_sele$total_frags * qc_sele$frac_promoter)/sum(qc_sele$total_frags)
frac_promoter = sum(qc_sele$total_frags * qc_sele$frac_enhancer)/sum(qc_sele$total_frags)
frac_enh = sum(qc_sele$total_frags * qc_sele$frac_tss)/sum(qc_sele$total_frags)
frac_tss = median(qc_sele$frac_peak)
frac_peak = median(qc_sele$frac_mito)
frac_mito = median(qc_sele$frac_promoter)
frac_promoter = median(qc_sele$frac_enhancer)
frac_enh = median(qc_sele$frac_tss)
frac_tss = data.frame(c(frac_peak, frac_promoter, frac_enh, frac_tss, frac_mito))
fracs =rownames(fracs) = c('Median % fragments overlapping peaks',
'Median % fragments overlapping promoters',
'Median % fragments overlapping Enhancers(ENCODE)',
'Median % fragments overlapping TSS (+/-1Kb)',
'Median % fragments in mitochondrial genome')
colnames(fracs) = 'pr'
$pr = round(fracs$pr, 3)
fracs$pr = paste0(100*fracs$pr, '%')
fracskable(fracs, row.names = T, col.names = NULL) %>%
kable_styling(full_width = F, position = 'left', font_size = 15)
Median % fragments overlapping peaks | 49.5% |
Median % fragments overlapping promoters | 31.3% |
Median % fragments overlapping Enhancers(ENCODE) | 15.1% |
Median % fragments overlapping TSS (+/-1Kb) | 32.8% |
Median % fragments in mitochondrial genome | 0% |