1 Introduction

1.1 Load scATAC-pro utility functions

2 Access QC results

2.1 Set up parameters, and read QC outputs
2.2 Access mapping QC for all aggregated data (including non-cell barcodes)
2.3 Cell/Peak Calling Summary
2.4 Mapping QC for aggregated cell barcodes
2.5 Access QC for single cells

2.5.1 Total fragments VS fraction in peaks
2.5.2 Distribution of Insert Size (All Barcodes)
2.5.3 TSS enrichment score profile (Cell Barcodes)

2.5.3.1 Density plot of total number of unique fragments
2.5.3.2 Overlapping with sequence annotated regions (Cell Barcodes)
2.5.3.3 Overall statistics (Cell Barcodes)

1 Introduction

scATAC-pro generates raw QC metrics in plain text format. This tutorial will tell users how to replot or extract those metrics. Download pbmc10x_scATACpro_output data to reproduce this tutorial.

1.1 Load scATAC-pro utility functions

library(data.table)
library(magrittr)
library(ggplot2)
library(Seurat)
library(RColorBrewer)
library(viridis)
library(compiler)
library(readr)
library(matrixStats)
library(Matrix)
library(kableExtra)
devtools::source_url("https://github.com/wbaopaul/scATAC-pro/blob/master/scripts/src/scATAC-pro_utils.R?raw=TRUE")

2 Access QC results

2.1 Set up parameters, and read QC outputs

output_dir = 'pbmc10x_scATACpro_output/'
qc_dir = paste0(output_dir, 'summary/')
configure_user_file = 'pbmc10x_scATACpro_output/configure_user.txt'

## read configure_users.txt file
read_conf_file(configure_user_file)

## read mapping stats
mapping_qc_file = paste0(qc_dir, OUTPUT_PREFIX,  '.MappingStats')
mapping_qc = fread(mapping_qc_file, header = F)

## read fragments file
fragments_file = paste0(qc_dir, OUTPUT_PREFIX,  '.fragments.tsv.gz')


frags = fread(fragments_file)
names(frags) = c('chr', 'start', 'end', 'bc', 'ndup')

## read mapping stats for cell barcodes
cell_mapping_qc_file = paste0(qc_dir, 'cell_barcodes.MappingStats')
cell_mapping_qc = fread(cell_mapping_qc_file, header = F)


## read qc per barcode file
bc_stat_file = paste0(qc_dir, OUTPUT_PREFIX, '.', PEAK_CALLER, '.qc_per_barcode.txt')
bc_stat = fread(bc_stat_file)

## read cell barcodes
cell_barcodes = fread(paste0(output_dir, '/filtered_matrix/', PEAK_CALLER, '/', CELL_CALLER, '/barcodes_doubletsRemoved.txt'), header = F)$V1

2.2 Access mapping QC for all aggregated data (including non-cell barcodes)

lib_complx = nrow(frags)/sum(frags$ndup)
lib_complx = paste0(100 * round(lib_complx, 3), '%')

mapping_qc$frac = round(mapping_qc$V2/mapping_qc$V2[1], 3)
mapping_qc$frac = paste0(100*mapping_qc$frac, '%')

mapping_qc = rbind(mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))

kable(mapping_qc, col.names = NULL, format = 'html', caption = paste('Sample:', OUTPUT_PREFIX)) %>%
  kable_styling("striped", full_width = F, position = 'left', font_size = 15)

Sample: pbmc10k
Total_Pairs	234252877	100%
Total_Pairs_Mapped	232439675	99.2%
Total_Uniq_Mapped	218826440	93.4%
Total_Mito_Mapped	498721	0.2%
Total_Dups	98960085	42.2%
Total_Pairs_MAPQ30	215148245	91.8%
Total_Mito_MAPQ30	397411	0.2%
Total_Dups_MAPQ30	94217407	40.2%
Library Complexity (nonredudant fraction)		63.6%

2.3 Cell/Peak Calling Summary

qc_sele = bc_stat[bc %in% cell_barcodes, ]
qc_nonsele = bc_stat[!bc %in% cell_barcodes, ]

peak_file = paste0(output_dir, 'peaks/', PEAK_CALLER, '/', OUTPUT_PREFIX, '_features_BlacklistRemoved.bed')
peaks = fread(peak_file)
ncells = length(cell_barcodes)
mapq30.frac.in.cell = paste0(round(sum(frags[bc %in% cell_barcodes]$ndup)/sum(frags$ndup), 3) * 100, '%')
#frac.in.cell = paste0(round(cell_mapping_qc$V2[2]/as.integer(as.character(mapping_qc$V2[2])), 3) * 100, '%')
med.frag.per.cell = round(median(qc_sele$total_frags))
frac.uniq = paste0(round(nrow(frags[bc %in% cell_barcodes])/sum(frags[bc %in% cell_barcodes]$ndup), 3) * 100, '%')
frac.in.peak = sum(bc_stat$total_frags * bc_stat$frac_peak)/sum(bc_stat$total_frags)
frac.in.peak = paste0(round(frac.in.peak, 3)*100, '%')
frac.in.cell = sum(qc_sele$total_frags)/sum(bc_stat$total_frags)
frac.in.cell = paste0(round(frac.in.cell, 3)*100, '%')
cell.table = data.frame(c(PEAK_CALLER, nrow(peaks),
                          frac.in.peak, 
                          CELL_CALLER,  paste0(ncells),
                          frac.in.cell,
                          paste0(med.frag.per.cell), 
                          frac.in.cell, mapq30.frac.in.cell)) 
rm(peaks)
rownames(cell.table) = c('Peak called by', 'Number of peaks', 
                         'Percentage of unique fragments in peaks',
                         'Cell called by', 'Estimated # of cells',
                         'Percentage of unique fragments in cells',
                         'Median # of unique fragments per cell', 
                         'Percentage of Mapped fragments in cells', 
                         'Percentage of MAPQ30 fragments in cells')  
kable(cell.table, row.names = T, col.names = NULL, format = 'html') %>%
  kable_styling("striped", full_width = F, position = 'left', font_size = 15)

Peak called by	MACS2
Number of peaks	149864
Percentage of unique fragments in peaks	37.4%
Cell called by	FILTER
Estimated # of cells	6395
Percentage of unique fragments in cells	58.4%
Median # of unique fragments per cell	10484
Percentage of Mapped fragments in cells	58.4%
Percentage of MAPQ30 fragments in cells	62.9%

2.4 Mapping QC for aggregated cell barcodes

if(CELL_MAP_QC){
    lib_complx = frac.uniq

    cell_mapping_qc$frac = round(cell_mapping_qc$V2/cell_mapping_qc$V2[1], 3)
    cell_mapping_qc$frac = paste0(100*cell_mapping_qc$frac, '%')

    cell_mapping_qc = rbind(cell_mapping_qc, data.frame(V1 ='Library Complexity (nonredudant fraction)', V2 = '', frac = lib_complx))

    kable(cell_mapping_qc, col.names = NULL, format = 'html') %>%
      kable_styling("striped", full_width = F, position = 'left', font_size = 15)
}

Total_Pairs	141500136	100%
Total_Pairs_Mapped	140843153	99.5%
Total_Uniq_Mapped	134746877	95.2%
Total_Mito_Mapped	227559	0.2%
Total_Dups	67560585	47.7%
Total_Pairs_MAPQ30	61916390	43.8%
Total_Mito_MAPQ30	70978	0.1%
Total_Dups_MAPQ30	30606046	21.6%
Library Complexity (nonredudant fraction)		57.5%

2.5 Access QC for single cells

A few plots and metrics for single cells

2.5.1 Total fragments VS fraction in peaks

bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]

library(ggplot2)
library(grid)
nsub_frags = min(15000, nrow(bc_stat))  ## downsample for scatter plot
bc_stat_down = bc_stat[sort(sample(1:nrow(bc_stat), nsub_frags)), ]
g <- ggplot(data = bc_stat_down, 
            aes(x = total_frags, y = frac_peak, col = group)) + 
  geom_point(size = 0.5) + scale_x_continuous(trans='log10') + theme_bw() +
      theme(legend.position = 'none', 
            legend.title=element_blank(),
            axis.text = element_text(size = 15, family = "Helvetica"),
            axis.title = element_text(size = 18, family = "Helvetica")) +
  xlab('Total #Unique Fragments') + ylab('Fraction in Peak')

text1 <- grobTree(textGrob("Cell", x=0.8,  y=0.93, hjust=0,
  gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8,  y=0.83, hjust=0,
  gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))

g + annotation_custom(text1) + annotation_custom(text2)+
  scale_color_manual(values = c('#1F78B4', '#B2DF8A'))

2.5.2 Distribution of Insert Size (All Barcodes)

fragments_len_file = paste0(output_dir, 'summary/', OUTPUT_PREFIX,  '.fragments.len.tsv.gz')
if(file.exists(fragments_len_file)) {
    frag_lens = fread(fragments_len_file)
    names(frag_lens) = 'isize'
}else{
    lens = frags$end - frags$start
    frag_lens = data.table('isize' = rep(lens, frags$ndup))
}
rm(frags)
frag_lens = frag_lens[sample(1:nrow(frag_lens), 1000000), ]
ggplot(data = frag_lens[isize < 800], aes(x = isize)) +
      geom_density(fill = 'lightblue') + xlab('Insert Size') + ylab('Density') + theme_bw() + theme(legend.title=element_blank(), 
                    legend.background = NULL, 
                    axis.text = element_text(size = 15, family = "Helvetica"), 
                    axis.title = element_text(size = 18, family = "Helvetica"))

2.5.3 TSS enrichment score profile (Cell Barcodes)

## read tss enrichement matrix
tss_escore_file = paste0(output_dir, '/signal/cell_barcodes.MAPQ30.aggregated.mtx.gz')
if(!file.exists(tss_escore_file)) tss_escore_file = paste0(output_dir, '/signal/', OUTPUT_PREFIX, '.aggregated.mtx.gz')

set.cols = brewer.pal(n=5, name = 'Dark2')
tss.mat = fread(tss_escore_file)
tss.mat = tss.mat[, -c(1:6)]
tss.mat[is.na(tss.mat)] = 0
tss.escore = colSums(tss.mat)
ma <- function(x, n = 10){stats::filter(x, rep(1 / n, n), sides = 2)}
tss.escore = ma(tss.escore)
tss.escore = tss.escore[14:213]
df = data.table(index = 10*(-100:99), escore = tss.escore/tss.escore[1])

ggplot(data = df, aes(x = index, y = escore)) + geom_line(size = 1, col = set.cols[1])  + theme_bw() +
  xlab('Distance to TSS (bp)') + ylab('TSS enrichment score') + theme(legend.title=element_blank(),
      axis.text = element_text(size = 15, family = "Helvetica"), 
      axis.title = element_text(size = 18, family = "Helvetica"))

2.5.3.1 Density plot of total number of unique fragments

bc_stat[, 'group' := ifelse(bc %in% cell_barcodes, 'cell', 'non-cell')]

p <- ggplot(data = bc_stat, aes(x = total_frags, fill = group)) + 
  geom_density() + scale_x_continuous(trans = 'log10') + theme_bw() +
  theme(legend.position='none', legend.title=element_blank(),
        axis.title = element_text(size = 18, family = "Helvetica"),
        axis.text = element_text(size = 15, family = "Helvetica")) + 
  xlab('Total #Unique Fragments') + ylab('Density') 

text1 <- grobTree(textGrob("Cell", x=0.8,  y=0.93, hjust=0,
  gp=gpar(col='#1F78B4', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))
text2 <- grobTree(textGrob("Non-cell", x=0.8,  y=0.83, hjust=0,
  gp=gpar(col='#B2DF8A', fontsize=15, fontface = 'bold', fontfamily = "Helvetica")))

p + annotation_custom(text1) + annotation_custom(text2) +
  scale_fill_manual(values = c('#1F78B4', '#B2DF8A'))

2.5.3.2 Overlapping with sequence annotated regions (Cell Barcodes)

qc_sele_df = data.table(frac = c(qc_sele$frac_peak, qc_sele$frac_tss, qc_sele$frac_promoter, qc_sele$frac_enh, qc_sele$frac_mito), 'type' = rep(c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'), each = nrow(qc_sele)))

qc_sele_df$type = factor(qc_sele_df$type, levels = c('Peaks', 'Tss', 'Promoter', 'Enhancer', 'Mito'))

ggplot(data = qc_sele_df, aes(y = frac, x = type, fill = type)) + ylab('Fraction') + theme_bw() +
  geom_boxplot(outlier.size = 0.01, show.legend = FALSE) + 
  scale_fill_manual(values = brewer.pal(5, 'Paired')) +
  theme(legend.position = 'none', 
        axis.text = element_text(size = 18, family = "Helvetica"), 
        axis.title.x = element_blank(), 
        axis.title.y = element_text(size = 18, family = "Helvetica")) + xlab('')

2.5.3.3 Overall statistics (Cell Barcodes)

frac_peak = sum(qc_sele$total_frags * qc_sele$frac_peak)/sum(qc_sele$total_frags)
frac_mito = sum(qc_sele$total_frags * qc_sele$frac_mito)/sum(qc_sele$total_frags)
frac_promoter = sum(qc_sele$total_frags * qc_sele$frac_promoter)/sum(qc_sele$total_frags)
frac_enh = sum(qc_sele$total_frags * qc_sele$frac_enhancer)/sum(qc_sele$total_frags)
frac_tss = sum(qc_sele$total_frags * qc_sele$frac_tss)/sum(qc_sele$total_frags)
frac_peak = median(qc_sele$frac_peak)
frac_mito = median(qc_sele$frac_mito)
frac_promoter = median(qc_sele$frac_promoter)
frac_enh = median(qc_sele$frac_enhancer)
frac_tss = median(qc_sele$frac_tss)
fracs = data.frame(c(frac_peak,  frac_promoter, frac_enh, frac_tss, frac_mito))
rownames(fracs) = c('Median % fragments overlapping peaks', 
                    'Median % fragments overlapping promoters', 
                    'Median % fragments overlapping Enhancers(ENCODE)', 
                    'Median % fragments overlapping TSS (+/-1Kb)', 
                    'Median % fragments in mitochondrial genome')
colnames(fracs) = 'pr'
fracs$pr = round(fracs$pr, 3)
fracs$pr = paste0(100*fracs$pr, '%')
kable(fracs, row.names = T, col.names = NULL) %>%
  kable_styling(full_width = F, position = 'left', font_size = 15)

Median % fragments overlapping peaks	49.5%
Median % fragments overlapping promoters	31.3%
Median % fragments overlapping Enhancers(ENCODE)	15.1%
Median % fragments overlapping TSS (+/-1Kb)	32.8%
Median % fragments in mitochondrial genome	0%

Access scATAC-pro QC results in R