############################################## ## Load packages ############################################## library(DESeq2) library(coseq) library(corrplot) ############################################## ## Read in Drosophila data and phenotypic info ############################################## #Option 1: (see bio.sourceforge est accessible) #droso_full <- read.table(header = TRUE, # "http://bowtie-bio.sourceforge.net/recount/pooled/modencodefly_pooledreps_count_table.txt") #conds_full <- read.table(header = TRUE, # "http://bowtie-bio.sourceforge.net/recount/pooled/modencodefly_pooled_phenodata.txt") #conds_full <- conds_full$stage #rownames(droso_full) <- droso_full$gene # Assign gene IDs to rownames #droso_full <- droso_full[,-1] ## Option 2: library(Biobase) load("modencode_fly_pooled.RData") droso_full <- exprs(modencodefly.eset.pooled) conds_full <- pData(modencodefly.eset.pooled)$stage ## Subset data to only include embryonic samples, ## remove genes with all 0's keep_index <- which(substr(conds_full, 1, 6) == "Embryo") droso <- droso_full[, keep_index] conds <- droplevels(conds_full[keep_index]) droso <- droso[which(rowSums(droso) > 0),] dim(droso) ## 13164 x 12 head(droso) conds ############################################## ## DESeq2 analysis: time as continuous effect ## LRT to compare full and reduced models ############################################## colData <- data.frame(time_group=conds, time=seq(1,23,by=2)) dds <- DESeqDataSetFromMatrix(countData = droso, colData = colData, design = ~ time) dds <- DESeq(dds, test="LRT", reduced = ~1) dds_res <- results(dds) summary(dds_res) hist(dds_res$pvalue) ## Plot examples of NDE and DE genes (use alpha = 0.05) par(mfrow=c(1,2)) set.seed(12345) alpha <- 0.05 matplot(t(droso[sample(which(dds_res$padj > alpha),10),]+1), type="l", lty=1, col="black", main="NDE across time", ylab="Log expression", xlab="Embryonic stage", log="y") matplot(t(droso[sample(which(dds_res$padj < alpha),10),]+1), type="l", lty=1, col="black", main="DE across time", ylab="Log expression", xlab="Embryonic stage", log="y") ## Subset data for clustering analysis: ## only DE genes and genes with mean normalized count > 50 droso_counts <- droso[which(dds_res$padj < alpha & rowMeans(counts(dds, norm=TRUE)>50)),] dim(droso_counts) ## Consider expression profiles for clustering droso_profiles <- transform_RNAseq(droso_counts, norm=sizeFactors(dds), transformation="profile")$tcounts head(round(droso_profiles,3)) ############################################## ## K-means ############################################## ## K-means on profiles K_choice <- 2:50 km <- vector("list", length(K_choice)) names(km) <- paste("K=", K_choice, sep="") for(K in K_choice) { cat("K =", K, "...\n") km[[paste("K=",K,sep="")]] <- kmeans(droso_profiles, centers=K, nstart=5) } ## Number of clusters unclear here: no elbow! withinss <- unlist(lapply(km, function(x) x$tot.withinss)) betweenss <- unlist(lapply(km, function(x) x$betweenss)) totss <- unlist(lapply(km, function(x) x$totss)) par(mfrow=c(1,2), mar=c(4,4,1,1)) plot(K_choice, withinss, type="b", xlab="Number of Clusters", ylab="Within-group SS") plot(K_choice, betweenss/totss, type="b", xlab="Number of Clusters", ylab="Between-group / Total SS") ## (Arbitrarily) choose model with K=20 clusters km_final <- km[["K=20"]] km_labels <- km_final$cluster km_centers <- km_final$centers km_size <- km_final$size cat(" Total SS:\n ", round(km_final$totss,2), "\n", "Within SS:\n ", round(km_final$withinss,2), "\n", "Total within SS:\n ", round(km_final$tot.withinss,2), "\n", "Between SS:\n ", round(km_final$betweenss,2), "\n") ## Plot cluster centers matplot(t(km_centers), type="l", col="black", lty=1, lwd=2, ylab="Kmeans cluster centers", xlab="Embryonic stage") ############################################## ## Hierarchical clustering ############################################## droso_dist <- dist(droso_profiles, method="euclidean") hc <- hclust(droso_dist) plot(hc, labels=FALSE, xlab = "Euclidean distance, profiles") abline(h=0.52, lty=2, col="red") ## Cut the tree for K=20 clusters c <- cutree(hc, h=0.52) table(c) hclust_labels <- c hclust_size <- table(c) ############################################## ## coseq: Gaussian mixture model w/ transformation ############################################## ## Takes about 2 min to run on 6 cores of Windows laptop ## Note: iter and init.runs artificially low for example run time!! set.seed(12345) start <- proc.time()[3] coseq_norm <- coseq(droso_counts, K=2:40, model="Normal", transformation="arcsin", norm=sizeFactors(dds), init.runs=3, iter=25, parallel=TRUE) cat("Run time:", (proc.time()[3] - start)/60, "minutes\n") ## Examine results summary(coseq_norm) plot(coseq_norm) ## Reproduce specific graphs, if desired plot(coseq_norm, order=TRUE, graphs=c("probapost_boxplots", "probapost_barplots")) plot(coseq_norm, profiles_order=TRUE, graphs=c("boxplots")) coseq_norm_labels <- apply(coseq_norm$results$ICL.results$probaPost, 1, which.max) ## Look at a few per-matrix correlation plots param <- NormMixParam(coseq_norm) par(mfrow=c(2,3)) for(k in 1:6) { tmp <- param$rho[,,k] colnames(tmp) <- rownames(tmp) <- as.character(conds) corrplot(tmp, main="") text(-2,15,paste("Cluster", k), font=2) } ############################################## ## Compare clustering results ############################################## ## Compare K-means, hclust, coseq_norm clusterings all_labels <-data.frame(coseq_norm=coseq_norm_labels, hclust=hclust_labels, Kmeans=km_labels) compareARI(all_labels) ## Cross-tabulate of clusterings from coseq Norm and K-means matchContTable(coseq_norm_labels, km_labels)