## R environment: packages, working directory
# Install new packages with R (not run: they are already installed)
# install.packages(c("nnet","e1071","rpart","car","randomForest"))
# Set working directory (has to be modified to work with a different environment)
setwd("home/fp/Bureau/inra-package/ML/TP")

## Introduction
# Loading genes expressions
d <- read.table("../../Data/sel_data.csv",sep=";",header=T,row.names=1,dec=",")
dim(d)
names(d)
summary(d)

# Loading pH
pH <- scan("../../Data/sel_ph.csv",dec=",")
length(pH)
summary(pH)

# Data distribution
layout(matrix(c(1,1,2),ncol=3))
boxplot(d,main="genes expressions")
boxplot(pH,main="pH")

# Correlation analysis
heatmap(as.matrix(d))

# Correlation with pH
pH.cor <- function(x) {cor(x,pH)}
totalcor <- apply(d,1,pH.cor)
hist(totalcor,main="Correlations between genes expression\n and pH",xlab="Correlations")

# Classes definition
pH.classes <- rep(0,length(pH))
pH.classes[pH>5.7] <- 1
table(pH.classes)

# Training / Test sets split: regression framework
set.seed(16011357)
training <- sample(1:ncol(d),round(0.8*ncol(d)),replace=F)
d.train <- t(d[,training])
pH.train <- pH[training]
d.test <- t(d[,-training])
pH.test <- pH[-training]
r.train <- cbind(d.train,pH.train)
r.test <- cbind(d.test,pH.test)
colnames(r.train) <- c(colnames(d.train),"pH")
colnames(r.test) <- c(colnames(d.train),"pH")
r.train <- data.frame(r.train)
r.test <- data.frame(r.test)

# Training / Test sets split: classification framework
pHc.train <- pH.classes[training]
pHc.test <- pH.classes[-training]
c.train <- cbind(d.train,pHc.train)
c.test <- cbind(d.test,pHc.test)
colnames(c.train) <- c(colnames(d.train),"pHc")
colnames(c.test) <- c(colnames(d.train),"pHc")
c.train <- data.frame(c.train)
c.test <- data.frame(c.test)
c.train$pHc <- factor(c.train$pHc)
c.test$pHc <- factor(c.test$pHc)

# Training / Test sets (short) analysis
layout(matrix(c(1,1,2,3,3,4),ncol=3,nrow=2,byrow=T))
boxplot(d.train,main="genes expressions (train)")
boxplot(pH.train,main="pH (train)")
boxplot(d.test,main="genes expressions (test)")
boxplot(pH.test,main="pH (test)")
table(pHc.train)
table(pHc.test)

## Neural network
# Loading the data
load("../../Data/train-test.Rdata")
load("../../Data/selected.Rdata")
library(nnet)

# Simple use: MLP with p=3 and no decay (regression framework)
set.seed(17011644)
nn1 <- nnet(d.train[,selected],pH.train,size=3,decay=0,maxit=500,linout=T)
# Analysis
print(nn1)
summary(nn1)
# Training error and pseudo-R²
mean((pH.train-nn1$fitted)^2)
1-mean((pH.train-nn1$fitted)^2)/var(pH.train)
# Predictions (test set)
pred.test <- predict(nn1,d.test[,selected])
# Test error and pseudo-R²
mean((pH.test-pred.test)^2)
1-mean((pH.test-pred.test)^2)/var(pH.train)
# Fitted values vs True values
plot(pH.train,nn1$fitted,xlab="Observations",ylab="Fitted values",main="",pch=3,col="blue")
points(pH.test,pred.test,pch=19,col="darkred")
legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test"))
abline(0,1,col="darkgreen")

# Do it again to see the variability in the performances
set.seed(17012120)
nn2 <- nnet(d.train[,selected],pH.train,size=3,decay=0,maxit=500,linout=T)
# Analysis
print(nn2)
summary(nn2)
# Training error
mean((pH.train-nn2$fitted)^2)
1-mean((pH.train-nn2$fitted)^2)/var(pH.train)
# Predictions (test set)
pred.test <- predict(nn2,d.test[,selected])
# (test) MSE and pseudo-R²
mean((pH.test-pred.test)^2)
1-mean((pH.test-pred.test)^2)/var(pH.train)
# Fitted values vs True values
plot(pH.train,nn2$fitted,xlab="Observations",ylab="Fitted values",main="",pch=3,col="blue")
points(pH.test,pred.test,pch=19,col="darkred")
legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test"))
abline(0,1,col="darkgreen")

# simple use: MLP with p=3 and no decay (classification framework)
set.seed(17011716)
nnc1 <- nnet(pHc~.,data=c.train,size=3,decay=0,maxit=500,linout=F)
# Analysis
print(nnc1)
summary(nnc1)
nnc1$fitted
# Training error
pred.train <- rep(0,length(nnc1$fitted))
pred.train[nnc1$fitted>0.5] <- 1
table(pred.train,pHc.train)
sum(pred.train!=pHc.train)/length(pred.train)
# Predictions and test error
raw.pred.test <- predict(nnc1,c.test)
pred.test <- rep(0,length(raw.pred.test))
pred.test[raw.pred.test>0.5] <- 1
table(pred.test,pHc.test)
sum(pred.test!=pHc.test)/length(pred.test)

# tuning MLP with e1071 by means of CV
library(e1071)
set.seed(1643)
t.nnc2 <- tune.nnet(pHc~.,data=c.train[,c(selected,ncol(c.train))],size=c(2,5,10),decay=10^(-c(10,8,6,4,2)),maxit=500,linout=F,tunecontrol = tune.control(nrepeat=5,sampling="cross",cross=10))
# Analysis
plot(t.nnc2)
summary(t.nnc2)
t.nnc2$best.parameters
# Selecting the best MLP
nnc2 <- t.nnc2$best.model
# Training error
pred.train <- rep(0,length(nnc2$fitted))
pred.train[nnc2$fitted>0.5] <- 1
table(pred.train,pHc.train)
sum(pred.train!=pHc.train)/length(pred.train)
# Predictions and test error
raw.pred.test <- predict(nnc2,c.test)
pred.test <- rep(0,length(raw.pred.test))
pred.test[raw.pred.test>0.5] <- 1
table(pred.test,pHc.test)
sum(pred.test!=pHc.test)/length(pred.test)

## CART
# Loading rpart package
library(rpart)
# Regression tree
tree1 <- rpart(pH~.,data=r.train,control=rpart.control(minsplit=2))
print(tree1)
summary(tree1)
plot(tree1)
text(tree1)
tree1$where # leaf number
tree1$frame # nodes features
# Training predictions and error
pred.train <- tree1$frame$yval[tree1$where] # predicted values (train set)
mean((pred.train-r.train$pH)^2)
1-mean((pred.train-r.train$pH)^2)/var(pH.train)
# Test predictions and error
pred.test <- predict(tree1,r.test)
mean((pred.test-r.test$pH)^2)
1-mean((pred.test-r.test$pH)^2)/var(pH.train)
# Fitted values vs True values
plot(pH.train,pred.train,xlab="Observations",ylab="Fitted values",main="",pch=3,col="blue")
points(pH.test,pred.test,pch=19,col="darkred")
legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test"))
abline(0,1,col="darkgreen")

# Classification tree and tuning with e1071
set.seed(20011108)
t.treec1 <- tune.rpart(pHc~.,data=c.train,minsplit=c(4,6,8,10),tunecontrol=tune.control(sampling="bootstrap",nboot=20))
plot(t.treec1)
t.treec1$best.parameters
treec1 <- t.treec1$best.model
summary(treec1)
plot(treec1)
text(treec1)
treec1$where # leaf number
treec1$frame # nodes features
# Training predictions and error
pred.train <- predict(treec1,c.train) # predicted values (train set)
pred.train
pred.train <- apply(pred.train,1,which.max)
pred.train
library(car) # to have the "recode" function
pred.train <- recode(pred.train,"2=1;1=0")
table(pred.train,pHc.train)
sum(pred.train!=pHc.train)/length(pred.train)
# Test predictions and error
pred.test <- predict(treec1,c.test)
pred.test <- apply(pred.test,1,which.max)
pred.test <- recode(pred.test,"2=1;1=0")
table(pred.test,pHc.test)
sum(pred.test!=pHc.test)/length(pred.test)

## random forest
# basic use (regression framework)
library(randomForest)
set.seed(21011144)
rf1 <- randomForest(d.train,pH.train,importance=T,keep.forest=T)
plot(rf1)
rf1$ntree
rf1$mtry
rf1$importance
# Predictions and errors
mean((pH.train-rf1$predicted)^2)
rf1$mse
1-mean((pH.train-rf1$predicted)^2)/var(pH.train)
1-mean((pH.train-predict(rf1,d.train))^2)/var(pH.train)
pred.test <- predict(rf1,d.test)
mean((pH.test-pred.test)^2)
1-mean((pH.test-pred.test)^2)/var(pH.train)
# Fitted values vs True values
plot(pH.train,rf1$predicted,xlab="Observations",ylab="Fitted values",main="",pch=3,col="blue")
points(pH.test,pred.test,pch=19,col="darkred")
legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test"))
abline(0,1,col="darkgreen")
# Importance analysis
layout(matrix(c(1,2),ncol=2))
barplot(t(rf1$importance[,1]),xlab="variables",ylab="% Inc. MSE",col="darkred",las=2,names=rep(NA,nrow(rf1$importance)))
barplot(t(rf1$importance[,2]),xlab="variables",ylab="Inc. Node Purity",col="darkred",las=2,names=rep(NA,nrow(rf1$importance)))
which(rf1$importance[,1]>0.0005)

# more advanced features (classification framework)
set.seed(20011203)
rfc1 <- randomForest(d.train,factor(pHc.train),ntree=5000,mtry=150,sampsize=40,nodesize=2,xtest=d.test,ytest=factor(pHc.test),importance=T,keep.forest=F)
plot(rfc1)
rfc1$importance
# Predictions and error
table(rfc1$predicted,pHc.train)
sum(rfc1$predicted!=pHc.train)/length(pHc.train)
table(rfc1$test$predicted,pHc.test)
sum(rfc1$test$predicted!=pHc.test)/length(pHc.test)
# Importance analysis
layout(matrix(c(1,2),ncol=2))
barplot(t(rfc1$importance[,3]),xlab="variables",ylab="Mean Dec. Accu.",col="darkred",las=2,names=rep(NA,nrow(rfc1$importance)))
barplot(t(rfc1$importance[,4]),xlab="variables",ylab="Mean Dec. Gini",col="darkred",las=2,names=rep(NA,nrow(rfc1$importance)))
which(rfc1$importance[,3]>0.002)