1 Introduction

scATAC-pro generates raw QC metrics in plain text format. This tutorial will tell users how to replot or extract those metrics. Download pbmc10x_scATACpro_output data to reproduce this tutorial.

1.1 Load scATAC-pro utility functions

library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
devtools::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE")

2 Access QC results

2.1 Set up parameters, and read QC outputs

output_dir = 'pbmc10x_scATACpro_output/'
qc_dir = paste0(output_dir, 'summary/')
configure_user_file = 'pbmc10x_scATACpro_output/configure_user.txt'

## read configure_users.txt file
read_conf_file(configure_user_file)

## read mapping stats
mapping_qc_file = paste0(qc_dir, OUTPUT_PREFIX,  '.MappingStats')
mapping_qc = fread(mapping_qc_file, header = F)

## read fragments file
fragments_file = paste0(qc_dir, OUTPUT_PREFIX,  '.fragments.tsv.gz')


frags = fread(fragments_file)
names(frags) = c('chr', 'start', 'end', 'bc', 'ndup')

## read mapping stats for cell barcodes
cell_mapping_qc_file = paste0(qc_dir, 'cell_barcodes.MappingStats')
cell_mapping_qc = fread(cell_mapping_qc_file, header = F)


## read qc per barcode file
bc_stat_file = paste0(qc_dir, OUTPUT_PREFIX, '.', PEAK_CALLER, '.qc_per_barcode.txt')
bc_stat = fread(bc_stat_file)

## read cell barcodes
cell_barcodes = fread(paste0(output_dir, '/filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/barcodes_doubletsRemoved.txt'), header = F)$V1

2.2 Access mapping QC for all aggregated data (including non-cell barcodes)

lib_complx = nrow(frags)/sum(frags$ndup)
lib_complx = paste0(100 * round(lib_complx, 3), '%')

mapping_qc$frac = round(mapping_qc$V2/mapping_qc$V2[1], 3)
mapping_qc$frac = paste0(100*mapping_qc$frac, '%')

mapping_qc = rbind(mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))

kable(mapping_qc, col.names = NULL, format = 'html', caption = paste('Sample:', OUTPUT_PREFIX)) %>%
  kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Sample: pbmc10k
Total_Pairs 234252877 100%
Total_Pairs_Mapped 232439675 99.2%
Total_Uniq_Mapped 218826440 93.4%
Total_Mito_Mapped 498721 0.2%
Total_Dups 98960085 42.2%
Total_Pairs_MAPQ30 215148245 91.8%
Total_Mito_MAPQ30 397411 0.2%
Total_Dups_MAPQ30 94217407 40.2%
Library Complexity (nonredudant fraction) 63.6%

2.3 Cell/Peak Calling Summary

qc_sele = bc_stat[bc %in% cell_barcodes, ]
qc_nonsele = bc_stat[!bc %in% cell_barcodes, ]

peak_file = paste0(output_dir, 'peaks/', PEAK_CALLER, '/', OUTPUT_PREFIX, '_features_BlacklistRemoved.bed')
peaks = fread(peak_file)
ncells = length(cell_barcodes)
mapq30.frac.in.cell = paste0(round(sum(frags[bc %in% cell_barcodes]$ndup)/sum(frags$ndup), 3) * 100, '%')
#frac.in.cell = paste0(round(cell_mapping_qc$V2[2]/as.integer(as.character(mapping_qc$V2[2])), 3) * 100, '%')
med.frag.per.cell = round(median(qc_sele$total_frags))
frac.uniq = paste0(round(nrow(frags[bc %in% cell_barcodes])/sum(frags[bc %in% cell_barcodes]$ndup), 3) * 100, '%')
frac.in.peak = sum(bc_stat$total_frags * bc_stat$frac_peak)/sum(bc_stat$total_frags)
frac.in.peak = paste0(round(frac.in.peak, 3)*100, '%')
frac.in.cell = sum(qc_sele$total_frags)/sum(bc_stat$total_frags)
frac.in.cell = paste0(round(frac.in.cell, 3)*100, '%')
cell.table = data.frame(c(PEAK_CALLER, nrow(peaks),
                          frac.in.peak, 
                          CELL_CALLER,  paste0(ncells),
                          frac.in.cell,
                          paste0(med.frag.per.cell), 
                          frac.in.cell, mapq30.frac.in.cell)) 
rm(peaks)
rownames(cell.table) = c('Peak called by', 'Number of peaks', 
                         'Percentage of unique fragments in peaks',
                         'Cell called by', 'Estimated # of cells',
                         'Percentage of unique fragments in cells',
                         'Median # of unique fragments per cell', 
                         'Percentage of Mapped fragments in cells', 
                         'Percentage of MAPQ30 fragments in cells')  
kable(cell.table, row.names = T, col.names = NULL, format = 'html') %>%
  kable_styling("striped", full_width = F, position = 'left', font_size = 15)
Peak called by MACS2
Number of peaks 149864
Percentage of unique fragments in peaks 37.4%
Cell called by FILTER
Estimated # of cells 6395
Percentage of unique fragments in cells 58.4%
Median # of unique fragments per cell 10484
Percentage of Mapped fragments in cells 58.4%
Percentage of MAPQ30 fragments in cells 62.9%

2.4 Mapping QC for aggregated cell barcodes

if(CELL_MAP_QC){
    lib_complx = frac.uniq

    cell_mapping_qc$frac = round(cell_mapping_qc$V2/cell_mapping_qc$V2[1], 3)
    cell_mapping_qc$frac = paste0(100*cell_mapping_qc$frac, '%')

    cell_mapping_qc = rbind(cell_mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))

    kable(cell_mapping_qc, col.names = NULL, format = 'html') %>%
      kable_styling("striped", full_width = F, position = 'left', font_size = 15)
}
Total_Pairs 141500136 100%
Total_Pairs_Mapped 140843153 99.5%
Total_Uniq_Mapped 134746877 95.2%
Total_Mito_Mapped 227559 0.2%
Total_Dups 67560585 47.7%
Total_Pairs_MAPQ30 61916390 43.8%
Total_Mito_MAPQ30 70978 0.1%
Total_Dups_MAPQ30 30606046 21.6%
Library Complexity (nonredudant fraction) 57.5%

2.5 Access QC for single cells

A few plots and metrics for single cells

2.5.1 Total fragments VS fraction in peaks

bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]

library(ggplot2)
library(grid)
nsub_frags = min(15000, nrow(bc_stat))  ## downsample for scatter plot
bc_stat_down = bc_stat[sort(sample(1:nrow(bc_stat), nsub_frags)), ]
g <- ggplot(data = bc_stat_down, 
            aes(x = total_frags, y = frac_peak, col = group)) + 
  geom_point(size = 0.5) + scale_x_continuous(trans='log10') + theme_bw() +
      theme(legend.position = 'none', 
            legend.title=element_blank(),
            axis.text = element_text(size = 15, family = "Helvetica"),
            axis.title = element_text(size = 18, family = "Helvetica")) +
  xlab('Total #Unique Fragments') + ylab('Fraction in Peak')

text1 <- grobTree(textGrob("Cell", x=0.8,  y=0.93, hjust=0,
  gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8,  y=0.83, hjust=0,
  gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))

g + annotation_custom(text1) + annotation_custom(text2)+
  scale_color_manual(values = c('#1F78B4', '#B2DF8A'))

2.5.2 Distribution of Insert Size (All Barcodes)

fragments_len_file = paste0(output_dir, 'summary/', OUTPUT_PREFIX,  '.fragments.len.tsv.gz')
if(file.exists(fragments_len_file)) {
    frag_lens = fread(fragments_len_file)
    names(frag_lens) = 'isize'
}else{
    lens = frags$end - frags$start
    frag_lens = data.table('isize' = rep(lens, frags$ndup))
}
rm(frags)
frag_lens = frag_lens[sample(1:nrow(frag_lens), 1000000), ]
ggplot(data = frag_lens[isize < 800], aes(x = isize)) +
      geom_density(fill = 'lightblue') + xlab('Insert Size') + ylab('Density') + theme_bw() + theme(legend.title=element_blank(), 
                    legend.background = NULL, 
                    axis.text = element_text(size = 15, family = "Helvetica"), 
                    axis.title = element_text(size = 18, family = "Helvetica")) 

2.5.3 TSS enrichment score profile (Cell Barcodes)

## read tss enrichement matrix
tss_escore_file = paste0(output_dir, '/signal/cell_barcodes.MAPQ30.aggregated.mtx.gz')
if(!file.exists(tss_escore_file)) tss_escore_file = paste0(output_dir, '/signal/', OUTPUT_PREFIX, '.aggregated.mtx.gz')

set.cols = brewer.pal(n=5, name = 'Dark2')
tss.mat = fread(tss_escore_file)
tss.mat = tss.mat[, -c(1:6)]
tss.mat[is.na(tss.mat)] = 0
tss.escore = colSums(tss.mat)
ma <- function(x, n = 10){stats::filter(x, rep(1 / n, n), sides = 2)}
tss.escore = ma(tss.escore)
tss.escore = tss.escore[14:213]
df = data.table(index = 10*(-100:99), escore = tss.escore/tss.escore[1])

ggplot(data = df, aes(x = index, y = escore)) + geom_line(size = 1, col = set.cols[1])  + theme_bw() +
  xlab('Distance to TSS (bp)') + ylab('TSS enrichment score') + theme(legend.title=element_blank(),
      axis.text = element_text(size = 15, family = "Helvetica"), 
      axis.title = element_text(size = 18, family = "Helvetica")) 

2.5.3.1 Density plot of total number of unique fragments

bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]

p <- ggplot(data = bc_stat, aes(x = total_frags, fill = group)) + 
  geom_density() + scale_x_continuous(trans = 'log10') + theme_bw() +
  theme(legend.position='none', legend.title=element_blank(),
        axis.title = element_text(size = 18, family = "Helvetica"),
        axis.text = element_text(size = 15, family = "Helvetica")) + 
  xlab('Total #Unique Fragments') + ylab('Density') 

text1 <- grobTree(textGrob("Cell", x=0.8,  y=0.93, hjust=0,
  gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8,  y=0.83, hjust=0,
  gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))

p + annotation_custom(text1) + annotation_custom(text2) +
  scale_fill_manual(values = c('#1F78B4', '#B2DF8A'))

2.5.3.2 Overlapping with sequence annotated regions (Cell Barcodes)

qc_sele_df = data.table(frac = c(qc_sele$frac_peak, qc_sele$frac_tss, qc_sele$frac_promoter, qc_sele$frac_enh, qc_sele$frac_mito), 'type' = rep(c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'), each = nrow(qc_sele)))

qc_sele_df$type = factor(qc_sele_df$type, levels = c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'))

ggplot(data = qc_sele_df, aes(y = frac, x = type, fill = type)) + ylab('Fraction') + theme_bw() +
  geom_boxplot(outlier.size = 0.01, show.legend = FALSE) + 
  scale_fill_manual(values = brewer.pal(5, 'Paired')) +
  theme(legend.position = 'none', 
        axis.text = element_text(size = 18, family = "Helvetica"), 
        axis.title.x = element_blank(), 
        axis.title.y = element_text(size = 18, family = "Helvetica")) + xlab('') 

2.5.3.3 Overall statistics (Cell Barcodes)

frac_peak = sum(qc_sele$total_frags * qc_sele$frac_peak)/sum(qc_sele$total_frags)
frac_mito = sum(qc_sele$total_frags * qc_sele$frac_mito)/sum(qc_sele$total_frags)
frac_promoter = sum(qc_sele$total_frags * qc_sele$frac_promoter)/sum(qc_sele$total_frags)
frac_enh = sum(qc_sele$total_frags * qc_sele$frac_enhancer)/sum(qc_sele$total_frags)
frac_tss = sum(qc_sele$total_frags * qc_sele$frac_tss)/sum(qc_sele$total_frags)
frac_peak = median(qc_sele$frac_peak)
frac_mito = median(qc_sele$frac_mito)
frac_promoter = median(qc_sele$frac_promoter)
frac_enh = median(qc_sele$frac_enhancer)
frac_tss = median(qc_sele$frac_tss)
fracs = data.frame(c(frac_peak,  frac_promoter, frac_enh, frac_tss, frac_mito))
rownames(fracs) = c('Median % fragments overlapping peaks', 
                    'Median % fragments overlapping promoters', 
                    'Median % fragments overlapping Enhancers(ENCODE)', 
                    'Median % fragments overlapping TSS (+/-1Kb)', 
                    'Median % fragments in mitochondrial genome')
colnames(fracs) = 'pr'
fracs$pr = round(fracs$pr, 3)
fracs$pr = paste0(100*fracs$pr, '%')
kable(fracs, row.names = T, col.names = NULL) %>%
  kable_styling(full_width = F, position = 'left', font_size = 15)
Median % fragments overlapping peaks 49.5%
Median % fragments overlapping promoters 31.3%
Median % fragments overlapping Enhancers(ENCODE) 15.1%
Median % fragments overlapping TSS (+/-1Kb) 32.8%
Median % fragments in mitochondrial genome 0%