This file gives the answer to the document “Practical statistical analysis of RNA-Seq data” using the R packages DESeq2 (version 1.4.5), mixOmics (version 5.0-1), RColorBrewer(version 1.0-5) and HTSFilter (version 1.6.0).

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist, unsplit
## 
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
library(RColorBrewer)
library(mixOmics)
## Loading required package: MASS
## Loading required package: lattice
## 
## Attaching package: 'mixOmics'
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     map
## 
## The following object is masked from 'package:IRanges':
## 
##     map
library(HTSFilter)
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## No methods found in "IRanges" for requests: mcols

3.1. Starting from count table

Properly set the directory from which files are imported:

directory <- "RNAseq_data/count_table_files/"
dir(directory)
## [1] "count_table.tsv"    "pasilla_design.txt"

Exercise 3.1 Read the files:

rawCountTable <- read.table(paste0(directory,"count_table.tsv"), header=TRUE,
                            row.names=1)
sampleInfo <- read.table(paste0(directory,"pasilla_design.txt"), header=TRUE,
                         row.names=1)

Exercise 3.2 Have a look at the count data:

head(rawCountTable)
##             untreated1 untreated2 untreated3 untreated4 treated1 treated2
## FBgn0000003          0          0          0          0        0        0
## FBgn0000008         92        161         76         70      140       88
## FBgn0000014          5          1          0          0        4        0
## FBgn0000015          0          2          1          2        1        0
## FBgn0000017       4664       8714       3564       3150     6205     3072
## FBgn0000018        583        761        245        310      722      299
##             treated3
## FBgn0000003        1
## FBgn0000008       70
## FBgn0000014        0
## FBgn0000015        0
## FBgn0000017     3334
## FBgn0000018      308
nrow(rawCountTable)
## [1] 14599

14599 genes are included in this file.

Exercise 3.3 Have a look at the sample information and order the count table in the same way that the sample information:

sampleInfo
##                   type number.of.lanes total.number.of.reads exon.counts
## treated1   single-read               5              35158667    15679615
## treated2    paired-end               2         12242535 (x2)    15620018
## treated3    paired-end               2         12443664 (x2)    12733865
## untreated1 single-read               2              17812866    14924838
## untreated2 single-read               6              34284521    20764558
## untreated3  paired-end               2         10542625 (x2)    10283129
## untreated4  paired-end               2         12214974 (x2)    11653031
rawCountTable <- rawCountTable[,match(rownames(sampleInfo),
                                      colnames(rawCountTable))]
head(rawCountTable)
##             treated1 treated2 treated3 untreated1 untreated2 untreated3
## FBgn0000003        0        0        1          0          0          0
## FBgn0000008      140       88       70         92        161         76
## FBgn0000014        4        0        0          5          1          0
## FBgn0000015        1        0        0          0          2          1
## FBgn0000017     6205     3072     3334       4664       8714       3564
## FBgn0000018      722      299      308        583        761        245
##             untreated4
## FBgn0000003          0
## FBgn0000008         70
## FBgn0000014          0
## FBgn0000015          2
## FBgn0000017       3150
## FBgn0000018        310

Exercise 3.4 Create the ‘condition’ column

sampleInfo$condition <- substr(rownames(sampleInfo), 1,
                               nchar(rownames(sampleInfo))-1)
sampleInfo$condition[sampleInfo$condition=="untreated"] <- "control"
sampleInfo$condition <- factor(sampleInfo$condition)
sampleInfo
##                   type number.of.lanes total.number.of.reads exon.counts
## treated1   single-read               5              35158667    15679615
## treated2    paired-end               2         12242535 (x2)    15620018
## treated3    paired-end               2         12443664 (x2)    12733865
## untreated1 single-read               2              17812866    14924838
## untreated2 single-read               6              34284521    20764558
## untreated3  paired-end               2         10542625 (x2)    10283129
## untreated4  paired-end               2         12214974 (x2)    11653031
##            condition
## treated1     treated
## treated2     treated
## treated3     treated
## untreated1   control
## untreated2   control
## untreated3   control
## untreated4   control

Exercise 3.5 Create a ‘DESeqDataSet’ object

ddsFull <- DESeqDataSetFromMatrix(as.matrix(rawCountTable), sampleInfo,
                                  formula(~condition))
ddsFull
## class: DESeqDataSet 
## dim: 14599 7 
## exptData(0):
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
##   FBgn0261575
## rowData metadata column names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(5): type number.of.lanes total.number.of.reads
##   exon.counts condition

3.2. Starting from separate files

Exercise 3.6 List all files in directory

directory <- "RNAseq_data/separate_files/"
sampleFiles <- list.files(directory)
sampleFiles
## [1] "pasilla_design.txt" "treated1fb.txt"     "treated2fb.txt"    
## [4] "treated3fb.txt"     "untreated1fb.txt"   "untreated2fb.txt"  
## [7] "untreated3fb.txt"   "untreated4fb.txt"

Exercise 3.7 Create an object with file informations

keptFiles <- sampleFiles[-1]
sampleName <- sapply(keptFiles, function(afile)
  substr(afile, 1, nchar(afile)-6))
condition<- sapply(keptFiles, function(afile) 
  substr(afile, 1, nchar(afile)-7))
fileInfo <- data.frame(sampleName = sampleName, sampleFiles = keptFiles,
                       condition = condition)
rownames(fileInfo) <- NULL
fileInfo
##   sampleName      sampleFiles condition
## 1   treated1   treated1fb.txt   treated
## 2   treated2   treated2fb.txt   treated
## 3   treated3   treated3fb.txt   treated
## 4 untreated1 untreated1fb.txt untreated
## 5 untreated2 untreated2fb.txt untreated
## 6 untreated3 untreated3fb.txt untreated
## 7 untreated4 untreated4fb.txt untreated

Exercise 3.8 Construct a ‘DESeqDataSet’ object

ddsHTSeq <- DESeqDataSetFromHTSeqCount(fileInfo, directory, formula(~condition))
ddsHTSeq
## class: DESeqDataSet 
## dim: 70463 7 
## exptData(0):
## assays(1): counts
## rownames(70463): FBgn0000003:001 FBgn0000008:001 ...
##   FBgn0261575:001 FBgn0261575:002
## rowData metadata column names(0):
## colnames(7): treated1 treated2 ... untreated3 untreated4
## colData names(1): condition

3.3. Preparing the data object for the analysis of interest

Exercise 3.9 Select the subset of paire-end samples

dds <- subset(ddsFull, select=colData(ddsFull)$type=="paired-end")
dds
## class: DESeqDataSet 
## dim: 14599 4 
## exptData(0):
## assays(1): counts
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
##   FBgn0261575
## rowData metadata column names(0):
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(5): type number.of.lanes total.number.of.reads
##   exon.counts condition
colData(dds)
## DataFrame with 4 rows and 5 columns
##                  type number.of.lanes total.number.of.reads exon.counts
##              <factor>       <integer>              <factor>   <integer>
## treated2   paired-end               2         12242535 (x2)    15620018
## treated3   paired-end               2         12443664 (x2)    12733865
## untreated3 paired-end               2         10542625 (x2)    10283129
## untreated4 paired-end               2         12214974 (x2)    11653031
##            condition
##             <factor>
## treated2     treated
## treated3     treated
## untreated3   control
## untreated4   control

3.4 Data exploration and quality assessment

Exercise 3.10 Extract pseudo-counts (ie \(\log_2(K+1)\))

pseudoCounts <- log2(counts(dds)+1)
head(pseudoCounts)
##              treated2  treated3 untreated3 untreated4
## FBgn0000003  0.000000  1.000000   0.000000   0.000000
## FBgn0000008  6.475733  6.149747   6.266787   6.149747
## FBgn0000014  0.000000  0.000000   0.000000   0.000000
## FBgn0000015  0.000000  0.000000   1.000000   1.584963
## FBgn0000017 11.585432 11.703471  11.799686  11.621594
## FBgn0000018  8.228819  8.271463   7.942515   8.280771

Exercise 3.11 Histogram for pseudo-counts (sample treated2)

hist(pseudoCounts[,"treated2"])

Exercise 3.12 Boxplot for pseudo-counts

boxplot(pseudoCounts, col="gray")

Exercise 3.13 MA-plots between control or treated samples

par(mfrow=c(1,2))
## treated2 vs treated3
# A values
avalues <- (pseudoCounts[,1] + pseudoCounts[,2])/2
# M values
mvalues <- (pseudoCounts[,1] - pseudoCounts[,2])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="treated")
abline(h=0, col="red")
## untreated3 vs untreated4
# A values
avalues <- (pseudoCounts[,3] + pseudoCounts[,4])/2
# M values
mvalues <- (pseudoCounts[,3] - pseudoCounts[,4])
plot(avalues, mvalues, xlab="A", ylab="M", pch=19, main="control")
abline(h=0, col="red")

Exercise 3.14 PCA for pseudo-counts

vsd <- varianceStabilizingTransformation(dds)
vsd
## class: SummarizedExperiment 
## dim: 14599 4 
## exptData(0):
## assays(1): ''
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
##   FBgn0261575
## rowData metadata column names(6): baseMean baseVar ... dispFit
##   dispFit.1
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor
plotPCA(vsd)

Exercise 3.15 heatmap for pseudo-counts (using mixOmics package)

sampleDists <- as.matrix(dist(t(pseudoCounts)))
sampleDists
##            treated2 treated3 untreated3 untreated4
## treated2    0.00000 60.42701   75.96033   70.65302
## treated3   60.42701  0.00000   80.03024   71.71797
## untreated3 75.96033 80.03024    0.00000   65.97926
## untreated4 70.65302 71.71797   65.97926    0.00000
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(16)
cim(sampleDists, col=cimColor, symkey=FALSE)

3.5. Differential expression analysis

Exercise 3.16 Run the DESeq2 analysis

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds
## class: DESeqDataSet 
## dim: 14599 4 
## exptData(0):
## assays(3): counts mu cooks
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
##   FBgn0261575
## rowData metadata column names(28): baseMean baseVar ... deviance
##   maxCooks
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor

3.6. Inspecting the results

Exercise 3.17 Extract the results

res <- results(dds)
res
## log2 fold change (MAP): condition treated vs control 
## Wald test p-value: condition treated vs control 
## DataFrame with 14599 rows and 6 columns
##                 baseMean log2FoldChange      lfcSE       stat     pvalue
##                <numeric>      <numeric>  <numeric>  <numeric>  <numeric>
## FBgn0000003    0.2242980     0.01999754 0.03841923  0.5205085 0.60270918
## FBgn0000008   76.2956431    -0.04663587 0.21614576 -0.2157612 0.82917388
## FBgn0000014    0.0000000             NA         NA         NA         NA
## FBgn0000015    0.7810873    -0.07124212 0.07009119 -1.0164204 0.30942925
## FBgn0000017 3298.6821506    -0.24384591 0.12034584 -2.0262098 0.04274329
## ...                  ...            ...        ...        ...        ...
## FBgn0261571     0.000000             NA         NA         NA         NA
## FBgn0261572     5.272899    -0.19531772  0.1595563 -1.2241302  0.2209031
## FBgn0261573  1728.419843     0.04936833  0.1099932  0.4488310  0.6535536
## FBgn0261574  3129.036923    -0.03913704  0.1289667 -0.3034662  0.7615346
## FBgn0261575     2.659185     0.05090077  0.1244492  0.4090084  0.6825335
##                  padj
##             <numeric>
## FBgn0000003        NA
## FBgn0000008 0.9566162
## FBgn0000014        NA
## FBgn0000015        NA
## FBgn0000017 0.2353693
## ...               ...
## FBgn0261571        NA
## FBgn0261572        NA
## FBgn0261573 0.9027184
## FBgn0261574 0.9398855
## FBgn0261575        NA

Exercise 3.18 Obtain information on the meaning of the columns

mcols(res)
## DataFrame with 6 rows and 2 columns
##           type                                          description
##    <character>                                          <character>
## 1 intermediate            mean of normalized counts for all samples
## 2      results log2 fold change (MAP): condition treated vs control
## 3      results         standard error: condition treated vs control
## 4      results         Wald statistic: condition treated vs control
## 5      results      Wald test p-value: condition treated vs control
## 6      results                                 BH adjusted p-values

Exercise 3.19 Count the number of significant genes at level 1%

sum(res$padj < 0.01, na.rm=TRUE)
## [1] 509

Exercise 3.20 Extract significant genes and sort them by the strongest down regulation

sigDownReg <- res[!is.na(res$padj), ]
sigDownReg <- sigDownReg[sigDownReg$padj < 0.01, ]
sigDownReg <- sigDownReg[order(sigDownReg$log2FoldChange),]
sigDownReg
## log2 fold change (MAP): condition treated vs control 
## Wald test p-value: condition treated vs control 
## DataFrame with 509 rows and 6 columns
##               baseMean log2FoldChange      lfcSE      stat        pvalue
##              <numeric>      <numeric>  <numeric> <numeric>     <numeric>
## FBgn0039155   463.4369      -3.668239 0.16451365 -22.29748 3.909282e-110
## FBgn0039827   188.5927      -3.009043 0.19731233 -15.25015  1.642499e-52
## FBgn0003360  2544.2512      -2.795779 0.10893830 -25.66387 2.960163e-145
## FBgn0034736   162.0375      -2.565874 0.19789037 -12.96614  1.903771e-38
## FBgn0026562 36449.4713      -2.339838 0.09897541 -23.64060 1.474641e-123
## ...                ...            ...        ...       ...           ...
## FBgn0003501   232.3835       1.718844  0.1751360  9.814335  9.768012e-23
## FBgn0051092   128.8251       1.777316  0.1987698  8.941578  3.836522e-19
## FBgn0000071   302.3697       2.163293  0.1634766 13.233045  5.654891e-40
## FBgn0035189   197.4689       2.238500  0.1934138 11.573630  5.605945e-31
## FBgn0025111  1340.2282       2.722749  0.1171002 23.251458 1.375043e-119
##                      padj
##                 <numeric>
## FBgn0039155 7.362156e-107
## FBgn0039827  1.546618e-49
## FBgn0003360 2.229891e-141
## FBgn0034736  9.560737e-36
## FBgn0026562 5.554234e-120
## ...                   ...
## FBgn0003501  2.452748e-20
## FBgn0051092  7.225130e-17
## FBgn0000071  3.276792e-37
## FBgn0035189  2.346088e-28
## FBgn0025111 3.452733e-116

Exercise 3.21 Extract significant genes and sort them by the strongest up regulation

sigUpReg <- res[!is.na(res$padj), ]
sigUpReg <- sigUpReg[sigUpReg$padj < 0.01, ]
sigUpReg <- sigUpReg[order(sigUpReg$log2FoldChange, decreasing=TRUE),]
sigUpReg
## log2 fold change (MAP): condition treated vs control 
## Wald test p-value: condition treated vs control 
## DataFrame with 509 rows and 6 columns
##               baseMean log2FoldChange      lfcSE      stat        pvalue
##              <numeric>      <numeric>  <numeric> <numeric>     <numeric>
## FBgn0025111  1340.2282       2.722749  0.1171002 23.251458 1.375043e-119
## FBgn0035189   197.4689       2.238500  0.1934138 11.573630  5.605945e-31
## FBgn0000071   302.3697       2.163293  0.1634766 13.233045  5.654891e-40
## FBgn0051092   128.8251       1.777316  0.1987698  8.941578  3.836522e-19
## FBgn0003501   232.3835       1.718844  0.1751360  9.814335  9.768012e-23
## ...                ...            ...        ...       ...           ...
## FBgn0026562 36449.4713      -2.339838 0.09897541 -23.64060 1.474641e-123
## FBgn0034736   162.0375      -2.565874 0.19789037 -12.96614  1.903771e-38
## FBgn0003360  2544.2512      -2.795779 0.10893830 -25.66387 2.960163e-145
## FBgn0039827   188.5927      -3.009043 0.19731233 -15.25015  1.642499e-52
## FBgn0039155   463.4369      -3.668239 0.16451365 -22.29748 3.909282e-110
##                      padj
##                 <numeric>
## FBgn0025111 3.452733e-116
## FBgn0035189  2.346088e-28
## FBgn0000071  3.276792e-37
## FBgn0051092  7.225130e-17
## FBgn0003501  2.452748e-20
## ...                   ...
## FBgn0026562 5.554234e-120
## FBgn0034736  9.560737e-36
## FBgn0003360 2.229891e-141
## FBgn0039827  1.546618e-49
## FBgn0039155 7.362156e-107

Exercise 3.22 Create permanent storage of results

write.csv(sigDownReg, file="sigDownReg-deseq.csv")
write.csv(sigUpReg, file="sigUpReg-deseq.csv")

3.7 Diagnostic plot for multiple testing

Exercise 3.23 Plot a histogram of unadjusted p-values after filtering

hist(res$pvalue, breaks=50)

3.8 Interpreting the DE analysis results

Exercise 3.24 Create a MA plot showing differentially expressed genes

plotMA(res, alpha=0.01)

Exercise 3.25 Create a Volcano plot

volcanoData <- cbind(res$log2FoldChange, -log10(res$padj))
volcanoData <- na.omit(volcanoData)
colnames(volcanoData) <- c("logFC", "negLogPval")
head(volcanoData)
##            logFC   negLogPval
## [1,] -0.04663587  0.019262290
## [2,] -0.24384591  0.628250140
## [3,] -0.04161106  0.024127703
## [4,] -0.00870916  0.006284238
## [5,]  0.48382518  5.875357944
## [6,]  0.72477623 13.492690129
plot(volcanoData, pch=19, cex=0.5)

Exercise 3.26 Transform the normalized counts for variance stabilization

vsnd <- varianceStabilizingTransformation(dds, blind=FALSE)
vsnd
## class: SummarizedExperiment 
## dim: 14599 4 
## exptData(0):
## assays(1): ''
## rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574
##   FBgn0261575
## rowData metadata column names(28): baseMean baseVar ... deviance
##   maxCooks
## colnames(4): treated2 treated3 untreated3 untreated4
## colData names(6): type number.of.lanes ... condition sizeFactor

Exercise 3.27 Extract the transformed data

head(assay(vsnd), 10)
##              treated2  treated3 untreated3 untreated4
## FBgn0000003  7.467258  7.569963   7.467258   7.467258
## FBgn0000008  8.454053  8.314503   8.459439   8.355754
## FBgn0000014  7.467258  7.467258   7.467258   7.467258
## FBgn0000015  7.467258  7.467258   7.583294   7.619755
## FBgn0000017 11.709561 11.703802  12.112722  11.757305
## FBgn0000018  9.213338  9.169608   9.181377   9.250923
## FBgn0000022  7.467258  7.467258   7.467258   7.467258
## FBgn0000024  7.750574  7.696720   7.668130   7.653985
## FBgn0000028  7.574489  7.569963   7.467258   7.467258
## FBgn0000032  9.970716  9.968425  10.005398   9.947107
selY <- assay(vsnd)[!is.na(res$pval), ]
selY <- selY[res$pval[!is.na(res$pval)] < 0.01,]
cimColor <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)[255:1]
cim(t(selY), col=cimColor, symkey=FALSE)