Contents

1 Required packages and other preparations

library(knitr)
library(DChIPRep)
library(BiocStyle)
library(ggplot2)
library(plyr)
library(dplyr)
library(RColorBrewer)
library(stringr)
library(pheatmap)
library(openxlsx)
library(devtools)
library(biomaRt)
library(tidyr)
library(csaw)
library(DESeq2)
library(edgeR)
library(fdrtool)
library(reshape2)
library(smoothmest)
   Loading required package: MASS
   
   Attaching package: 'MASS'
   
   The following object is masked from 'package:biomaRt':
   
       select
   
   The following object is masked from 'package:dplyr':
   
       select

2 DChIPRep case study – Figure 1

We first load the data from a pre–saved DChIPRepResults object. We also import the sample annotation table than contains information on the samples used.

load("importedData.RData")

sampleTable_K4me2 <- read.csv("sampleTable_K4me2.csv")
kable(sampleTable_K4me2, format = "html")
ChIP Input upstream downstream condition sampleID
BY_conf_K4me2_1__ORF_count.txt BY_conf_WCE_1__ORF_count.txt 1000 1500 WT BY_K4me2_1
BY_conf_K4me2_2__ORF_count.txt BY_conf_WCE_2__ORF_count.txt 1000 1500 WT BY_K4me2_2
BY_conf_K4me2_3__ORF_count.txt BY_conf_WCE_3__ORF_count.txt 1000 1500 WT BY_K4me2_3
SET2_conf_K4me2_1__ORF_count.txt SET2_conf_WCE_1__ORF_count.txt 1000 1500 mutant BY_K4me2_1
SET2_conf_K4me2_2__ORF_count.txt SET2_conf_WCE_2__ORF_count.txt 1000 1500 mutant BY_K4me2_2
SET2_conf_K4me2_3__ORF_count.txt SET2_conf_WCE_3__ORF_count.txt 1000 1500 mutant BY_K4me2_3

Then we can perfom the testing and produce the TSS plot in Figure 1.

testDChIPRep <- runTesting(importedData, plotFDR = FALSE)
   gene-wise dispersion estimates
   mean-dispersion relationship
   final dispersion estimates
   Step 1... determine cutoff point
   Step 2... estimate parameters of null distribution and eta0
   Step 3... compute p-values and estimate empirical PDF/CDF
   Step 4... compute q-values and local fdr
plotSignificance(testDChIPRep)

### Get the number of bases with significant changes
resDChIPRep <- resultsDChIPRep(testDChIPRep)

table(resDChIPRep$lfdr < 0.2)
   
   FALSE  TRUE 
    1571   930
lfdrDChIPRep <- resDChIPRep$lfdr[1001:2501]
sum(lfdrDChIPRep < 0.2)
   [1] 906

3 A csaw/edgeR pipeline – Figure 2

We use edgeR based testing instead of DESeq2 for the testing. For this we extract the raw counts from the data and then run the tesing pipeline just like csaw does it, using the quasi–likelihood based testing pipeline implemented in edgeR.

Not that it is not possible to define a log–fold–change cutoff for testing in edgeR, so we use a post–hoc cutoff to determine significance.

We then produce a plot similar to Figure 1.

# extract counts
ccc <- counts(DESeq2Data(importedData))


condition <- colData(DESeq2Data(importedData))$condition
y <- DGEList(ccc, 
             group = condition)


design <- model.matrix(~ 0 +  condition)
colnames(design) <- levels(condition)
y <- estimateDisp(y, design = design)
plotBCV(y)

fit <- glmQLFit(y, design, robust=TRUE)
plotQLDisp(fit)

contrast <- makeContrasts(WT-mutant, levels=design)
res <- glmQLFTest(fit, contrast=contrast)

resEdgeR <- topTags(sort.by = "none", res, n=Inf)$table

FDRedgeR <- fdrtool(resEdgeR$PValue, statistic = "pvalue", plot = FALSE)
   Step 1... determine cutoff point
   Step 2... estimate parameters of null distribution and eta0
   Step 3... compute p-values and estimate empirical PDF/CDF
   Step 4... compute q-values and local fdr
resEdgeR$ldfr <- FDRedgeR$lfdr

table(resEdgeR$ldfr < 0.2)
   
   FALSE  TRUE 
    1157  1344
lfdrEdgeR <- FDRedgeR$lfdr[1000:2501]
sum(lfdrEdgeR < 0.2)
   [1] 1127
countsLog2 <- as.data.frame(log2(counts(DESeq2Data(importedData), norm = TRUE)))


sampleTable <- colData(DESeq2Data(importedData))

pos <-   seq(from =-unique(sampleTable$upstream)
                            , to = unique(sampleTable$downstream), by = 1)

suppressMessages(c.ggplot2 <- data.frame(pos = pos,
                        melt(countsLog2,
                                       variable.name= "sample",
                                       value.name= "PosSignal")))

### get mean per group for ratio plots
m.f <- function(x) { smhuber(x)$mu }
m.per.group <- t(aggregate.data.frame(t(countsLog2),
                                      by = list(sampleTable$condition), FUN = m.f )[,-1])
colnames(m.per.group) <-  levels(sampleTable$condition)


# introduce post hoc l2fc cutoff
idxSig <- resEdgeR$ldfr < 0.2 & (abs(resEdgeR$logFC / log(2)) > 0.05)


m.per.group <- as.data.frame(m.per.group)
m.per.group$pos <- pos
m.per.group$significant <- idxSig

dataGG <- gather(m.per.group,
       key = "experimental_Group_and_significance",
       value = "mean_log2_counts", 1:2)

dataGG$experimental_Group_and_significance <- ifelse(dataGG$significant,
                                                      "significant",
                                                     as.character(dataGG$experimental_Group_and_significance))

dataGG$experimental_Group_and_significance  <- mapvalues(dataGG$experimental_Group_and_significance,
          from = c("mutant", "WT", "significant"), to = c("Mutant", "Wild type", "Significant Difference"))

dataGG$experimental_Group_and_significance <- factor(dataGG$experimental_Group_and_significance)

pl <- (ggplot(data = dataGG, 
              aes_string(x = "pos",
                         y = "mean_log2_counts",
                         color = "experimental_Group_and_significance")) +
      geom_point() +
      labs(x ="Distance from TSS (bp)",
           y="Normalized counts (log2)") +
      scale_color_manual(values = c( "#d95f02", "#7570b3", "black"), name = "") +
      theme(panel.background = element_blank(), panel.grid.minor=element_blank(),
      axis.line = element_line(colour = "black", size = 0.5)))


pl

#svg(filename = 'K4me2_significance_edgeR.svg',height = 6, width = 15)
(pl + theme( axis.ticks.length = unit(10, "points"),
             axis.line = element_line(size = 1),
              axis.title = element_text(size = 16, face = "bold"),
             axis.text = element_text(size = 12, face = "bold"),
             plot.title = element_text(size = 16, face = "bold"),
             legend.position = c(0.4, .7),
             legend.key = element_rect(fill = "white"),
             legend.text = element_text(size = 16, face = "bold")
            )
+ ggtitle("Call for significantly enriched regions based on csaw/edgeR"))

#dev.off()

4 Output of sessionInfo()

sessionInfo()
   R version 3.2.2 (2015-08-14)
   Platform: x86_64-pc-linux-gnu (64-bit)
   Running under: CentOS release 6.5 (Final)
   
   locale:
    [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
    [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
    [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
    [9] LC_ADDRESS=C               LC_TELEPHONE=C            
   [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
   
   attached base packages:
   [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
   [8] methods   base     
   
   other attached packages:
    [1] smoothmest_0.1-2           MASS_7.3-45               
    [3] reshape2_1.4.1             fdrtool_1.2.15            
    [5] xtable_1.8-0               edgeR_3.12.0              
    [7] limma_3.26.3               DESeq2_1.10.1             
    [9] RcppArmadillo_0.6.400.2.2  Rcpp_0.12.2               
   [11] csaw_1.4.1                 SummarizedExperiment_1.0.2
   [13] Biobase_2.30.0             GenomicRanges_1.22.2      
   [15] GenomeInfoDb_1.6.1         IRanges_2.4.6             
   [17] S4Vectors_0.8.5            BiocGenerics_0.16.1       
   [19] tidyr_0.3.1                biomaRt_2.26.1            
   [21] devtools_1.9.1             openxlsx_3.0.0            
   [23] pheatmap_1.0.8             stringr_1.0.0             
   [25] RColorBrewer_1.1-2         ggplot2_2.0.0             
   [27] DChIPRep_1.0.3             dplyr_0.4.3               
   [29] plyr_1.8.3                 knitr_1.11                
   [31] BiocStyle_1.8.0           
   
   loaded via a namespace (and not attached):
    [1] splines_3.2.2           Formula_1.2-1           assertthat_0.1         
    [4] statmod_1.4.23          latticeExtra_0.6-26     Rsamtools_1.22.0       
    [7] yaml_2.1.13             RSQLite_1.0.0           lattice_0.20-33        
   [10] digest_0.6.8            XVector_0.10.0          colorspace_1.2-6       
   [13] htmltools_0.3           XML_3.98-1.3            genefilter_1.52.0      
   [16] zlibbioc_1.16.0         scales_0.3.0            BiocParallel_1.4.3     
   [19] annotate_1.48.0         GenomicFeatures_1.22.6  lazyeval_0.1.10        
   [22] nnet_7.3-11             survival_2.38-3         magrittr_1.5           
   [25] memoise_0.2.1           evaluate_0.8            foreign_0.8-66         
   [28] tools_3.2.2             formatR_1.2.1           munsell_0.4.2          
   [31] locfit_1.5-9.1          cluster_2.0.3           AnnotationDbi_1.32.3   
   [34] lambda.r_1.1.7          Biostrings_2.38.3       futile.logger_1.4.1    
   [37] grid_3.2.2              RCurl_1.95-4.7          labeling_0.3           
   [40] bitops_1.0-6            rmarkdown_0.9.2         codetools_0.2-14       
   [43] gtable_0.1.2            DBI_0.3.1               R6_2.1.1               
   [46] GenomicAlignments_1.6.1 gridExtra_2.0.0         rtracklayer_1.30.1     
   [49] Hmisc_3.17-1            futile.options_1.0.0    stringi_1.0-1          
   [52] geneplotter_1.48.0      rpart_4.1-10            acepack_1.3-3.3