##############################################
## Load packages
##############################################
library(DESeq2)
library(coseq)
library(corrplot)


##############################################
## Read in Drosophila data and phenotypic info
##############################################
#Option 1: (see bio.sourceforge est accessible)
#droso_full <- read.table(header = TRUE,
#       "http://bowtie-bio.sourceforge.net/recount/pooled/modencodefly_pooledreps_count_table.txt")
#conds_full <- read.table(header = TRUE,
#                    "http://bowtie-bio.sourceforge.net/recount/pooled/modencodefly_pooled_phenodata.txt")
#conds_full <- conds_full$stage
#rownames(droso_full) <- droso_full$gene # Assign gene IDs to rownames
#droso_full <- droso_full[,-1]

## Option 2:
library(Biobase)
load("modencode_fly_pooled.RData")
droso_full <- exprs(modencodefly.eset.pooled)
conds_full <- pData(modencodefly.eset.pooled)$stage




## Subset data to only include embryonic samples, 
## remove genes with all 0's
keep_index <- which(substr(conds_full, 1, 6) == "Embryo")
droso <- droso_full[, keep_index]
conds <- droplevels(conds_full[keep_index])
droso <- droso[which(rowSums(droso) > 0),]
dim(droso) ## 13164 x 12

head(droso)
conds


##############################################
## DESeq2 analysis: time as continuous effect
## LRT to compare full and reduced models
##############################################
colData <- data.frame(time_group=conds,
                      time=seq(1,23,by=2))
dds <- DESeqDataSetFromMatrix(countData = droso, 
                              colData = colData,
                              design = ~ time)
dds <- DESeq(dds, test="LRT", reduced = ~1)
dds_res <- results(dds)
summary(dds_res)
hist(dds_res$pvalue)

## Plot examples of NDE and DE genes (use alpha = 0.05)
par(mfrow=c(1,2))
set.seed(12345)
alpha <- 0.05
matplot(t(droso[sample(which(dds_res$padj > alpha),10),]+1), type="l", 
        lty=1,
        col="black", main="NDE across time", ylab="Log expression",
        xlab="Embryonic stage", log="y")
matplot(t(droso[sample(which(dds_res$padj < alpha),10),]+1), type="l", 
        lty=1,
        col="black", main="DE across time", ylab="Log expression",
        xlab="Embryonic stage", log="y")

## Subset data for clustering analysis: 
## only DE genes and genes with mean normalized count > 50
droso_counts <- droso[which(dds_res$padj < alpha & 
                          rowMeans(counts(dds, norm=TRUE)>50)),]
dim(droso_counts)

## Consider expression profiles for clustering
droso_profiles <- transform_RNAseq(droso_counts,
                                   norm=sizeFactors(dds),
                                   transformation="profile")$tcounts
head(round(droso_profiles,3))


##############################################
## K-means
##############################################

## K-means on profiles
K_choice <- 2:50
km <- vector("list", length(K_choice))
names(km) <- paste("K=", K_choice, sep="")
for(K in K_choice) {
  cat("K =", K, "...\n")
  km[[paste("K=",K,sep="")]] <- kmeans(droso_profiles, centers=K,
                                       nstart=5)
}

## Number of clusters unclear here: no elbow!
withinss <- unlist(lapply(km, function(x) x$tot.withinss))
betweenss <- unlist(lapply(km, function(x) x$betweenss))
totss <- unlist(lapply(km, function(x) x$totss))

par(mfrow=c(1,2), mar=c(4,4,1,1))
plot(K_choice, withinss, type="b", xlab="Number of Clusters",
     ylab="Within-group SS")
plot(K_choice, betweenss/totss, type="b", xlab="Number of Clusters",
     ylab="Between-group / Total SS")

## (Arbitrarily) choose model with K=20 clusters
km_final <- km[["K=20"]]
km_labels <- km_final$cluster
km_centers <- km_final$centers
km_size <- km_final$size
cat(" Total SS:\n  ", round(km_final$totss,2), "\n",
  "Within SS:\n  ", round(km_final$withinss,2), "\n",
  "Total within SS:\n  ", round(km_final$tot.withinss,2), "\n",
  "Between SS:\n  ", round(km_final$betweenss,2), "\n")

## Plot cluster centers
matplot(t(km_centers), type="l", col="black", lty=1, lwd=2,
        ylab="Kmeans cluster centers", xlab="Embryonic stage")


##############################################
## Hierarchical clustering
##############################################

droso_dist <- dist(droso_profiles, method="euclidean")
hc <- hclust(droso_dist)
plot(hc, labels=FALSE, xlab = "Euclidean distance, profiles")
abline(h=0.52, lty=2, col="red")

## Cut the tree for K=20 clusters
c <- cutree(hc, h=0.52)
table(c)

hclust_labels <- c
hclust_size <- table(c)


##############################################
## coseq: Gaussian mixture model w/ transformation
##############################################

## Takes about 2 min to run on 6 cores of Windows laptop
## Note: iter and init.runs artificially low for example run time!!
set.seed(12345)
start <- proc.time()[3]
coseq_norm <- coseq(droso_counts, K=2:40, model="Normal", 
                   transformation="arcsin",
                   norm=sizeFactors(dds), init.runs=3, iter=25, 
                   parallel=TRUE)
cat("Run time:", (proc.time()[3] - start)/60, "minutes\n")

## Examine results
summary(coseq_norm)
plot(coseq_norm)
## Reproduce specific graphs, if desired
plot(coseq_norm, order=TRUE, graphs=c("probapost_boxplots",
                                      "probapost_barplots"))
plot(coseq_norm, profiles_order=TRUE, graphs=c("boxplots"))

coseq_norm_labels <- apply(coseq_norm$results$ICL.results$probaPost, 1, 
                      which.max)

## Look at a few per-matrix correlation plots
param <- NormMixParam(coseq_norm)
par(mfrow=c(2,3))
for(k in 1:6) {
  tmp <- param$rho[,,k]
  colnames(tmp) <- rownames(tmp) <- as.character(conds)
  corrplot(tmp, main="")
  text(-2,15,paste("Cluster", k), font=2)
}


##############################################
## Compare clustering results
##############################################

## Compare K-means, hclust, coseq_norm clusterings
all_labels <-data.frame(coseq_norm=coseq_norm_labels,
                        hclust=hclust_labels,
                        Kmeans=km_labels)
compareARI(all_labels)

## Cross-tabulate of clusterings from coseq Norm and K-means
matchContTable(coseq_norm_labels, km_labels)