library(knitr)
library(DChIPRep)
library(BiocStyle)
library(ggplot2)
library(plyr)
library(dplyr)
library(RColorBrewer)
library(stringr)
library(pheatmap)
library(openxlsx)
library(devtools)
library(biomaRt)
library(tidyr)
library(csaw)
library(DESeq2)
library(edgeR)
library(fdrtool)
library(reshape2)
library(smoothmest)
Loading required package: MASS
Attaching package: 'MASS'
The following object is masked from 'package:biomaRt':
select
The following object is masked from 'package:dplyr':
select
We first load the data from a pre–saved DChIPRepResults
object. We also import the sample annotation table than contains information on the samples used.
load("importedData.RData")
sampleTable_K4me2 <- read.csv("sampleTable_K4me2.csv")
kable(sampleTable_K4me2, format = "html")
ChIP | Input | upstream | downstream | condition | sampleID |
---|---|---|---|---|---|
BY_conf_K4me2_1__ORF_count.txt | BY_conf_WCE_1__ORF_count.txt | 1000 | 1500 | WT | BY_K4me2_1 |
BY_conf_K4me2_2__ORF_count.txt | BY_conf_WCE_2__ORF_count.txt | 1000 | 1500 | WT | BY_K4me2_2 |
BY_conf_K4me2_3__ORF_count.txt | BY_conf_WCE_3__ORF_count.txt | 1000 | 1500 | WT | BY_K4me2_3 |
SET2_conf_K4me2_1__ORF_count.txt | SET2_conf_WCE_1__ORF_count.txt | 1000 | 1500 | mutant | BY_K4me2_1 |
SET2_conf_K4me2_2__ORF_count.txt | SET2_conf_WCE_2__ORF_count.txt | 1000 | 1500 | mutant | BY_K4me2_2 |
SET2_conf_K4me2_3__ORF_count.txt | SET2_conf_WCE_3__ORF_count.txt | 1000 | 1500 | mutant | BY_K4me2_3 |
Then we can perfom the testing and produce the TSS plot in Figure 1.
testDChIPRep <- runTesting(importedData, plotFDR = FALSE)
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
plotSignificance(testDChIPRep)
### Get the number of bases with significant changes
resDChIPRep <- resultsDChIPRep(testDChIPRep)
table(resDChIPRep$lfdr < 0.2)
FALSE TRUE
1571 930
lfdrDChIPRep <- resDChIPRep$lfdr[1001:2501]
sum(lfdrDChIPRep < 0.2)
[1] 906
We use edgeR based testing instead of DESeq2 for the testing. For this we extract the raw counts from the data and then run the tesing pipeline just like csaw does it, using the quasi–likelihood based testing pipeline implemented in edgeR.
Not that it is not possible to define a log–fold–change cutoff for testing in edgeR, so we use a post–hoc cutoff to determine significance.
We then produce a plot similar to Figure 1.
# extract counts
ccc <- counts(DESeq2Data(importedData))
condition <- colData(DESeq2Data(importedData))$condition
y <- DGEList(ccc,
group = condition)
design <- model.matrix(~ 0 + condition)
colnames(design) <- levels(condition)
y <- estimateDisp(y, design = design)
plotBCV(y)
fit <- glmQLFit(y, design, robust=TRUE)
plotQLDisp(fit)
contrast <- makeContrasts(WT-mutant, levels=design)
res <- glmQLFTest(fit, contrast=contrast)
resEdgeR <- topTags(sort.by = "none", res, n=Inf)$table
FDRedgeR <- fdrtool(resEdgeR$PValue, statistic = "pvalue", plot = FALSE)
Step 1... determine cutoff point
Step 2... estimate parameters of null distribution and eta0
Step 3... compute p-values and estimate empirical PDF/CDF
Step 4... compute q-values and local fdr
resEdgeR$ldfr <- FDRedgeR$lfdr
table(resEdgeR$ldfr < 0.2)
FALSE TRUE
1157 1344
lfdrEdgeR <- FDRedgeR$lfdr[1000:2501]
sum(lfdrEdgeR < 0.2)
[1] 1127
countsLog2 <- as.data.frame(log2(counts(DESeq2Data(importedData), norm = TRUE)))
sampleTable <- colData(DESeq2Data(importedData))
pos <- seq(from =-unique(sampleTable$upstream)
, to = unique(sampleTable$downstream), by = 1)
suppressMessages(c.ggplot2 <- data.frame(pos = pos,
melt(countsLog2,
variable.name= "sample",
value.name= "PosSignal")))
### get mean per group for ratio plots
m.f <- function(x) { smhuber(x)$mu }
m.per.group <- t(aggregate.data.frame(t(countsLog2),
by = list(sampleTable$condition), FUN = m.f )[,-1])
colnames(m.per.group) <- levels(sampleTable$condition)
# introduce post hoc l2fc cutoff
idxSig <- resEdgeR$ldfr < 0.2 & (abs(resEdgeR$logFC / log(2)) > 0.05)
m.per.group <- as.data.frame(m.per.group)
m.per.group$pos <- pos
m.per.group$significant <- idxSig
dataGG <- gather(m.per.group,
key = "experimental_Group_and_significance",
value = "mean_log2_counts", 1:2)
dataGG$experimental_Group_and_significance <- ifelse(dataGG$significant,
"significant",
as.character(dataGG$experimental_Group_and_significance))
dataGG$experimental_Group_and_significance <- mapvalues(dataGG$experimental_Group_and_significance,
from = c("mutant", "WT", "significant"), to = c("Mutant", "Wild type", "Significant Difference"))
dataGG$experimental_Group_and_significance <- factor(dataGG$experimental_Group_and_significance)
pl <- (ggplot(data = dataGG,
aes_string(x = "pos",
y = "mean_log2_counts",
color = "experimental_Group_and_significance")) +
geom_point() +
labs(x ="Distance from TSS (bp)",
y="Normalized counts (log2)") +
scale_color_manual(values = c( "#d95f02", "#7570b3", "black"), name = "") +
theme(panel.background = element_blank(), panel.grid.minor=element_blank(),
axis.line = element_line(colour = "black", size = 0.5)))
pl
#svg(filename = 'K4me2_significance_edgeR.svg',height = 6, width = 15)
(pl + theme( axis.ticks.length = unit(10, "points"),
axis.line = element_line(size = 1),
axis.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 16, face = "bold"),
legend.position = c(0.4, .7),
legend.key = element_rect(fill = "white"),
legend.text = element_text(size = 16, face = "bold")
)
+ ggtitle("Call for significantly enriched regions based on csaw/edgeR"))
#dev.off()
sessionInfo()
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS release 6.5 (Final)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] smoothmest_0.1-2 MASS_7.3-45
[3] reshape2_1.4.1 fdrtool_1.2.15
[5] xtable_1.8-0 edgeR_3.12.0
[7] limma_3.26.3 DESeq2_1.10.1
[9] RcppArmadillo_0.6.400.2.2 Rcpp_0.12.2
[11] csaw_1.4.1 SummarizedExperiment_1.0.2
[13] Biobase_2.30.0 GenomicRanges_1.22.2
[15] GenomeInfoDb_1.6.1 IRanges_2.4.6
[17] S4Vectors_0.8.5 BiocGenerics_0.16.1
[19] tidyr_0.3.1 biomaRt_2.26.1
[21] devtools_1.9.1 openxlsx_3.0.0
[23] pheatmap_1.0.8 stringr_1.0.0
[25] RColorBrewer_1.1-2 ggplot2_2.0.0
[27] DChIPRep_1.0.3 dplyr_0.4.3
[29] plyr_1.8.3 knitr_1.11
[31] BiocStyle_1.8.0
loaded via a namespace (and not attached):
[1] splines_3.2.2 Formula_1.2-1 assertthat_0.1
[4] statmod_1.4.23 latticeExtra_0.6-26 Rsamtools_1.22.0
[7] yaml_2.1.13 RSQLite_1.0.0 lattice_0.20-33
[10] digest_0.6.8 XVector_0.10.0 colorspace_1.2-6
[13] htmltools_0.3 XML_3.98-1.3 genefilter_1.52.0
[16] zlibbioc_1.16.0 scales_0.3.0 BiocParallel_1.4.3
[19] annotate_1.48.0 GenomicFeatures_1.22.6 lazyeval_0.1.10
[22] nnet_7.3-11 survival_2.38-3 magrittr_1.5
[25] memoise_0.2.1 evaluate_0.8 foreign_0.8-66
[28] tools_3.2.2 formatR_1.2.1 munsell_0.4.2
[31] locfit_1.5-9.1 cluster_2.0.3 AnnotationDbi_1.32.3
[34] lambda.r_1.1.7 Biostrings_2.38.3 futile.logger_1.4.1
[37] grid_3.2.2 RCurl_1.95-4.7 labeling_0.3
[40] bitops_1.0-6 rmarkdown_0.9.2 codetools_0.2-14
[43] gtable_0.1.2 DBI_0.3.1 R6_2.1.1
[46] GenomicAlignments_1.6.1 gridExtra_2.0.0 rtracklayer_1.30.1
[49] Hmisc_3.17-1 futile.options_1.0.0 stringi_1.0-1
[52] geneplotter_1.48.0 rpart_4.1-10 acepack_1.3-3.3