This file gives the answer to the document “Practical statistical analysis of RNA-Seq data” using the R packages edgeR (version 3.6.8), limma (version 3.20.7), mixOmics (version 5.0-1), RColorBrewer(version 1.0-5) and HTSFilter (version 1.6.0).
library(edgeR)
## Loading required package: limma
library(limma)
library(RColorBrewer)
library(mixOmics)
## Loading required package: MASS
## Loading required package: lattice
library(HTSFilter)
## Loading required package: Biobase
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
##
## The following object is masked from 'package:limma':
##
## plotMA
##
## The following object is masked from 'package:stats':
##
## xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, as.vector, cbind,
## colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
## intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unlist
##
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
Properly set the directory from which files are imported:
directory <- "RNAseq_data/count_table_files/"
dir(directory)
## [1] "count_table.tsv" "pasilla_design.txt"
Exercise 3.1 Read the files:
rawCountTable <- read.table(paste0(directory,"count_table.tsv"), header=TRUE,
row.names=1)
sampleInfo <- read.table(paste0(directory,"pasilla_design.txt"), header=TRUE,
row.names=1)
Exercise 3.2 Have a look at the count data:
head(rawCountTable)
## untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003 0 0 0 0 0 0
## FBgn0000008 92 161 76 70 140 88
## FBgn0000014 5 1 0 0 4 0
## FBgn0000015 0 2 1 2 1 0
## FBgn0000017 4664 8714 3564 3150 6205 3072
## FBgn0000018 583 761 245 310 722 299
## treated3
## FBgn0000003 1
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 0
## FBgn0000017 3334
## FBgn0000018 308
nrow(rawCountTable)
## [1] 14599
14599 genes are included in this file.
Exercise 3.3 Have a look at the sample information and order the count table in the same way that the sample information:
sampleInfo
## type number.of.lanes total.number.of.reads exon.counts
## treated1 single-read 5 35158667 15679615
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated1 single-read 2 17812866 14924838
## untreated2 single-read 6 34284521 20764558
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
rawCountTable <- rawCountTable[,match(rownames(sampleInfo),
colnames(rawCountTable))]
head(rawCountTable)
## treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003 0 0 1 0 0 0
## FBgn0000008 140 88 70 92 161 76
## FBgn0000014 4 0 0 5 1 0
## FBgn0000015 1 0 0 0 2 1
## FBgn0000017 6205 3072 3334 4664 8714 3564
## FBgn0000018 722 299 308 583 761 245
## untreated4
## FBgn0000003 0
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 2
## FBgn0000017 3150
## FBgn0000018 310
Exercise 3.4 Create the ‘condition’ column
sampleInfo$condition <- substr(rownames(sampleInfo), 1,
nchar(rownames(sampleInfo))-1)
sampleInfo$condition[sampleInfo$condition=="untreated"] <- "control"
sampleInfo
## type number.of.lanes total.number.of.reads exon.counts
## treated1 single-read 5 35158667 15679615
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated1 single-read 2 17812866 14924838
## untreated2 single-read 6 34284521 20764558
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
## condition
## treated1 treated
## treated2 treated
## treated3 treated
## untreated1 control
## untreated2 control
## untreated3 control
## untreated4 control
Exercise 4.1 Create a DGEList data object
dgeFull <- DGEList(rawCountTable, group=sampleInfo$condition)
dgeFull
## An object of class "DGEList"
## $counts
## treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003 0 0 1 0 0 0
## FBgn0000008 140 88 70 92 161 76
## FBgn0000014 4 0 0 5 1 0
## FBgn0000015 1 0 0 0 2 1
## FBgn0000017 6205 3072 3334 4664 8714 3564
## untreated4
## FBgn0000003 0
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 2
## FBgn0000017 3150
## 14594 more rows ...
##
## $samples
## group lib.size norm.factors
## treated1 treated 18670279 1
## treated2 treated 9571826 1
## treated3 treated 10343856 1
## untreated1 control 13972512 1
## untreated2 control 21911438 1
## untreated3 control 8358426 1
## untreated4 control 9841335 1
Exercise 4.2 Add the sample information object in the DGEList data
dgeFull$sampleInfo <- sampleInfo
dgeFull
## An object of class "DGEList"
## $counts
## treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003 0 0 1 0 0 0
## FBgn0000008 140 88 70 92 161 76
## FBgn0000014 4 0 0 5 1 0
## FBgn0000015 1 0 0 0 2 1
## FBgn0000017 6205 3072 3334 4664 8714 3564
## untreated4
## FBgn0000003 0
## FBgn0000008 70
## FBgn0000014 0
## FBgn0000015 2
## FBgn0000017 3150
## 14594 more rows ...
##
## $samples
## group lib.size norm.factors
## treated1 treated 18670279 1
## treated2 treated 9571826 1
## treated3 treated 10343856 1
## untreated1 control 13972512 1
## untreated2 control 21911438 1
## untreated3 control 8358426 1
## untreated4 control 9841335 1
##
## $sampleInfo
## type number.of.lanes total.number.of.reads exon.counts
## treated1 single-read 5 35158667 15679615
## treated2 paired-end 2 12242535 (x2) 15620018
## treated3 paired-end 2 12443664 (x2) 12733865
## untreated1 single-read 2 17812866 14924838
## untreated2 single-read 6 34284521 20764558
## untreated3 paired-end 2 10542625 (x2) 10283129
## untreated4 paired-end 2 12214974 (x2) 11653031
## condition
## treated1 treated
## treated2 treated
## treated3 treated
## untreated1 control
## untreated2 control
## untreated3 control
## untreated4 control
directory <- "RNAseq_data/separate_files/"
dir(directory)
## [1] "pasilla_design.txt" "treated1fb.txt" "treated2fb.txt"
## [4] "treated3fb.txt" "untreated1fb.txt" "untreated2fb.txt"
## [7] "untreated3fb.txt" "untreated4fb.txt"
Exercise 4.3 Reading the passilla design
fileInfo <- read.table(paste0(directory, "pasilla_design.txt"), header=TRUE)
fileInfo
## files type number.of.lanes total.number.of.reads
## 1 treated1fb.txt single-read 5 35158667
## 2 treated2fb.txt paired-end 2 12242535 (x2)
## 3 treated3fb.txt paired-end 2 12443664 (x2)
## 4 untreated1fb.txt single-read 2 17812866
## 5 untreated2fb.txt single-read 6 34284521
## 6 untreated3fb.txt paired-end 2 10542625 (x2)
## 7 untreated4fb.txt paired-end 2 12214974 (x2)
## exon.counts
## 1 15679615
## 2 15620018
## 3 12733865
## 4 14924838
## 5 20764558
## 6 10283129
## 7 11653031
Exercise 4.4 Create an additional column for the groups:
fileInfo$group <- substr(rownames(fileInfo), 1, nchar(rownames(fileInfo))-7)
fileInfo$group[fileInfo$group=="untreated"] <- "control"
fileInfo
## files type number.of.lanes total.number.of.reads
## 1 treated1fb.txt single-read 5 35158667
## 2 treated2fb.txt paired-end 2 12242535 (x2)
## 3 treated3fb.txt paired-end 2 12443664 (x2)
## 4 untreated1fb.txt single-read 2 17812866
## 5 untreated2fb.txt single-read 6 34284521
## 6 untreated3fb.txt paired-end 2 10542625 (x2)
## 7 untreated4fb.txt paired-end 2 12214974 (x2)
## exon.counts group
## 1 15679615
## 2 15620018
## 3 12733865
## 4 14924838
## 5 20764558
## 6 10283129
## 7 11653031
Exercise 4.5 Import data from separate files with readDGE:
dgeHTSeq <- readDGE(fileInfo, path=directory)
dgeHTSeq
## An object of class "DGEList"
## $samples
## files type number.of.lanes total.number.of.reads
## 1 treated1fb.txt single-read 5 35158667
## 2 treated2fb.txt paired-end 2 12242535 (x2)
## 3 treated3fb.txt paired-end 2 12443664 (x2)
## 4 untreated1fb.txt single-read 2 17812866
## 5 untreated2fb.txt single-read 6 34284521
## 6 untreated3fb.txt paired-end 2 10542625 (x2)
## 7 untreated4fb.txt paired-end 2 12214974 (x2)
## exon.counts group lib.size norm.factors
## 1 15679615 88834542 1
## 2 15620018 22381538 1
## 3 12733865 22573930 1
## 4 14924838 37094861 1
## 5 20764558 66650218 1
## 6 10283129 19318565 1
## 7 11653031 20196315 1
##
## $counts
## 1 2 3 4 5 6 7
## FBgn0000008:001 0 0 0 0 0 0 0
## FBgn0000008:002 0 0 0 0 0 1 0
## FBgn0000008:003 0 1 0 1 1 1 0
## FBgn0000008:004 1 0 1 0 1 0 1
## FBgn0000008:005 4 1 1 2 2 0 1
## 70461 more rows ...
Exercise 4.6 Select the subset paired-end samples from degFull
dge <- DGEList(dgeFull$counts[,dgeFull$sampleInfo$type=="paired-end"],
group=dgeFull$sampleInfo$condition[
dgeFull$sampleInfo$type=="paired-end"])
dge$sampleInfo <- dgeFull$sampleInfo[dgeFull$sampleInfo$type=="paired-end",]
Exercise 4.7 Extract pseudo-counts (ie \(\log_2(K+1)\))
pseudoCounts <- log2(dge$counts+1)
head(pseudoCounts)
## treated2 treated3 untreated3 untreated4
## FBgn0000003 0.000 1.000 0.000 0.000
## FBgn0000008 6.476 6.150 6.267 6.150
## FBgn0000014 0.000 0.000 0.000 0.000
## FBgn0000015 0.000 0.000 1.000 1.585
## FBgn0000017 11.585 11.703 11.800 11.622
## FBgn0000018 8.229 8.271 7.943 8.281
Exercise 4.8 Histogram for pseudo-counts (sample treated2
)
hist(pseudoCounts[,"treated2"])
Exercise 4.9 Boxplot for pseudo-counts
boxplot(pseudoCounts, col="gray")
Exercise 4.10 MA-plots between control or treated samples
par(mfrow=c(1,2))
## treated2 vs treated3
# A values
avalues <- (pseudoCounts[,1] + pseudoCounts[,2])/2
# M values
mvalues <- (pseudoCounts[,1] - pseudoCounts[,2])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="treated")
abline(h=0, col="red")
## untreated3 vs untreated4
# A values
avalues <- (pseudoCounts[,3] + pseudoCounts[,4])/2
# M values
mvalues <- (pseudoCounts[,3] - pseudoCounts[,4])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="control")
abline(h=0, col="red")
Exercise 4.11 MDS for pseudo-counts (using limma
package)
plotMDS(pseudoCounts)
Exercise 4.12 heatmap for pseudo-counts (using mixOmics
package)
sampleDists <- as.matrix(dist(t(pseudoCounts)))
sampleDists
## treated2 treated3 untreated3 untreated4
## treated2 0.00 60.43 75.96 70.65
## treated3 60.43 0.00 80.03 71.72
## untreated3 75.96 80.03 0.00 65.98
## untreated4 70.65 71.72 65.98 0.00
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(16)
cim(sampleDists, col=cimColor, symkey=FALSE)
Exercise 4.13 remove genes with zero counts for all samples
dge <- DGEList(dge$counts[apply(dge$counts, 1, sum) != 0, ],
group=dge$sampleInfo$condition)
dge$sampleInfo <- dge$sampleInfo
head(dge$counts)
## treated2 treated3 untreated3 untreated4
## FBgn0000003 0 1 0 0
## FBgn0000008 88 70 76 70
## FBgn0000015 0 0 1 2
## FBgn0000017 3072 3334 3564 3150
## FBgn0000018 299 308 245 310
## FBgn0000024 7 5 3 3
Exercise 4.14 estimate the normalization factors
dge <- calcNormFactors(dge)
dge$samples
## group lib.size norm.factors
## treated2 treated 9571826 1.0081
## treated3 treated 10343856 1.0179
## untreated3 control 8358426 1.0041
## untreated4 control 9841335 0.9706
Exercise 4.15 estimate common and tagwise dispersion
dge <- estimateCommonDisp(dge)
dge <- estimateTagwiseDisp(dge)
dge
## An object of class "DGEList"
## $counts
## treated2 treated3 untreated3 untreated4
## FBgn0000003 0 1 0 0
## FBgn0000008 88 70 76 70
## FBgn0000015 0 0 1 2
## FBgn0000017 3072 3334 3564 3150
## FBgn0000018 299 308 245 310
## 11496 more rows ...
##
## $samples
## group lib.size norm.factors
## treated2 treated 9571826 1.0081
## treated3 treated 10343856 1.0179
## untreated3 control 8358426 1.0041
## untreated4 control 9841335 0.9706
##
## $common.dispersion
## [1] 0.004761
##
## $pseudo.counts
## treated2 treated3 untreated3 untreated4
## FBgn0000003 0.000e+00 9.154e-01 2.776e-17 2.776e-17
## FBgn0000008 8.671e+01 6.273e+01 8.564e+01 6.960e+01
## FBgn0000015 2.776e-17 2.776e-17 1.167e+00 1.990e+00
## FBgn0000017 3.025e+03 3.008e+03 4.032e+03 3.133e+03
## FBgn0000018 2.944e+02 2.777e+02 2.777e+02 3.083e+02
## 11496 more rows ...
##
## $pseudo.lib.size
## [1] 9499789
##
## $AveLogCPM
## [1] -2.083 3.036 -1.793 8.440 4.940
## 11496 more elements ...
##
## $prior.n
## [1] 5
##
## $tagwise.dispersion
## [1] 7.439e-05 8.684e-03 7.439e-05 6.891e-03 3.601e-03
## 11496 more elements ...
Exercise 4.16 perform an exact test for the difference in expression between the two conditions “treated” and “control”
dgeTest <- exactTest(dge)
dgeTest
## An object of class "DGEExact"
## $table
## logFC logCPM PValue
## FBgn0000003 2.25662 -2.083 1.00000
## FBgn0000008 -0.05492 3.036 0.82398
## FBgn0000015 -3.78099 -1.793 0.25001
## FBgn0000017 -0.24803 8.440 0.04289
## FBgn0000018 -0.03655 4.940 0.78928
## 11496 more rows ...
##
## $comparison
## [1] "control" "treated"
##
## $genes
## NULL
Exercise 4.17 remove low expressed genes
filtData <- HTSFilter(dge)$filteredData
dgeTestFilt <- exactTest(filtData)
dgeTestFilt
## An object of class "DGEExact"
## $table
## logFC logCPM PValue
## FBgn0000008 -0.054919 3.036 8.240e-01
## FBgn0000017 -0.248031 8.440 4.289e-02
## FBgn0000018 -0.036550 4.940 7.893e-01
## FBgn0000032 0.004668 6.172 9.729e-01
## FBgn0000042 0.516376 12.711 6.202e-08
## 6614 more rows ...
##
## $comparison
## [1] "control" "treated"
##
## $genes
## NULL
Exercise 4.18 plot a histogram of unadjusted p-values
hist(dgeTest$table[,"PValue"], breaks=50)
Exercise 4.19 plot a histogram of unadjusted p-values after filtering
hist(dgeTestFilt$table[,"PValue"], breaks=50)
Exercise 4.20 extract a summary of the differential expression statistics
resNoFilt <- topTags(dgeTest, n=nrow(dgeTest$table))
head(resNoFilt)
## Comparison of groups: treated-control
## logFC logCPM PValue FDR
## FBgn0039155 -4.378 5.588 4.293e-184 4.937e-180
## FBgn0025111 2.943 7.159 2.758e-152 1.586e-148
## FBgn0003360 -2.961 8.059 1.939e-151 7.432e-148
## FBgn0039827 -4.129 4.281 5.594e-104 1.608e-100
## FBgn0026562 -2.447 11.903 2.260e-102 5.198e-99
## FBgn0035085 -2.499 5.542 1.241e-96 2.379e-93
resFilt <- topTags(dgeTestFilt, n=nrow(dgeTest$table))
head(resFilt)
## Comparison of groups: treated-control
## logFC logCPM PValue FDR
## FBgn0039155 -4.378 5.588 4.293e-184 2.841e-180
## FBgn0025111 2.943 7.159 2.758e-152 9.128e-149
## FBgn0003360 -2.961 8.059 1.939e-151 4.277e-148
## FBgn0039827 -4.129 4.281 5.594e-104 9.257e-101
## FBgn0026562 -2.447 11.903 2.260e-102 2.992e-99
## FBgn0035085 -2.499 5.542 1.241e-96 1.369e-93
Exercise 4.21 compare the number of differentially expressed genes with and without filtering
# before independent filtering
sum(resNoFilt$table$FDR < 0.05)
## [1] 1347
# after independent filtering
sum(resFilt$table$FDR < 0.05)
## [1] 1363
Exercise 4.22 extract and sort differentially expressed genes
sigDownReg <- resFilt$table[resFilt$table$FDR<0.05,]
sigDownReg <- sigDownReg[order(sigDownReg$logFC),]
head(sigDownReg)
## logFC logCPM PValue FDR
## FBgn0085359 -5.153 1.966 2.843e-26 3.764e-24
## FBgn0039155 -4.378 5.588 4.293e-184 2.841e-180
## FBgn0024288 -4.208 2.161 1.359e-33 2.645e-31
## FBgn0039827 -4.129 4.281 5.594e-104 9.257e-101
## FBgn0034434 -3.824 3.107 1.732e-52 7.644e-50
## FBgn0034736 -3.482 4.060 5.794e-68 3.196e-65
sigUpReg <- sigDownReg[order(sigDownReg$logFC, decreasing=TRUE),]
head(sigUpReg)
## logFC logCPM PValue FDR
## FBgn0033764 3.268 2.612 3.224e-29 4.743e-27
## FBgn0035189 2.973 4.427 7.154e-48 2.492e-45
## FBgn0025111 2.943 7.159 2.758e-152 9.128e-149
## FBgn0037290 2.935 2.523 1.192e-25 1.461e-23
## FBgn0038198 2.670 2.587 4.163e-19 3.167e-17
## FBgn0000071 2.565 5.034 1.711e-78 1.416e-75
Exercise 4.24 write the results in csv files
write.csv(sigDownReg, file="sigDownReg.csv")
write.csv(sigUpReg, file="sigUpReg.csv")
Exercise 4.25 create a MA plot with 1% differentially expressed genes
plotSmear(dgeTestFilt,
de.tags = rownames(resFilt$table)[which(resFilt$table$FDR<0.01)])
Exercise 4.26 create a Volcano plot
volcanoData <- cbind(resFilt$table$logFC, -log10(resFilt$table$FDR))
colnames(volcanoData) <- c("logFC", "negLogPval")
head(volcanoData)
## logFC negLogPval
## [1,] -4.378 179.55
## [2,] 2.943 148.04
## [3,] -2.961 147.37
## [4,] -4.129 100.03
## [5,] -2.447 98.52
## [6,] -2.499 92.86
plot(volcanoData, pch=19)
Exercise 4.27 transform the normalized counts in log-counts-per-million
y <- cpm(dge, log=TRUE, prior.count = 1)
head(y)
## treated2 treated3 untreated3 untreated4
## FBgn0000003 -3.2526 -2.3226 -3.253 -3.253
## FBgn0000008 3.2056 2.7556 3.195 2.894
## FBgn0000015 -3.2526 -3.2526 -2.158 -1.670
## FBgn0000017 8.3151 8.3073 8.730 8.366
## FBgn0000018 4.9585 4.8757 4.873 5.025
## FBgn0000024 -0.2681 -0.7863 -1.113 -1.255
Exercise 4.28 select 1% differentially expressed genes and produce a heatmap
selY <- y[rownames(resFilt$table)[resFilt$table$FDR<0.01 &
abs(resFilt$table$logFC)>1.5],]
head(selY)
## treated2 treated3 untreated3 untreated4
## FBgn0039155 2.0155 2.335 6.466 6.608
## FBgn0025111 7.9476 7.996 5.059 5.006
## FBgn0003360 5.9800 5.878 8.802 8.970
## FBgn0039827 0.6377 1.516 5.252 5.212
## FBgn0026562 10.1798 10.247 12.544 12.769
## FBgn0035085 3.8172 3.843 6.279 6.363
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
cim(t(selY), col=cimColor, symkey=FALSE)