## R environment: packages, working directory

# Install the required packages (not run here: they are already installed)
# install.packages(c("nnet","e1071","rpart","car","randomForest"))

# Set the working directory (must be adapted to your own environment)
setwd("/home/fp/Bureau/inra-package/ML/TP")

## Introduction

# Load the gene expression data
d <- read.table("../../Data/sel_data.csv", sep=";", header=TRUE, row.names=1, dec=",")
dim(d)
names(d)
summary(d)

# Load the pH values
pH <- scan("../../Data/sel_ph.csv", dec=",")
length(pH)
summary(pH)

# Data distribution
layout(matrix(c(1,1,2), ncol=3))
boxplot(d, main="Gene expressions")
boxplot(pH, main="pH")

# Correlation analysis
heatmap(as.matrix(d))

# Correlation of each gene with pH
pH.cor <- function(x) cor(x, pH)
totalcor <- apply(d, 1, pH.cor)
hist(totalcor, main="Correlations between gene expressions\nand pH", xlab="Correlations")

# Class definition: samples with pH above 5.7 are labelled 1, the others 0
pH.classes <- rep(0, length(pH))
pH.classes[pH > 5.7] <- 1
table(pH.classes)

# Training/test split: regression framework
set.seed(16011357)
training <- sample(1:ncol(d), round(0.8*ncol(d)), replace=FALSE)
d.train <- t(d[, training])
pH.train <- pH[training]
d.test <- t(d[, -training])
pH.test <- pH[-training]
r.train <- cbind(d.train, pH.train)
r.test <- cbind(d.test, pH.test)
colnames(r.train) <- c(colnames(d.train), "pH")
colnames(r.test) <- c(colnames(d.train), "pH")
r.train <- data.frame(r.train)
r.test <- data.frame(r.test)

# Training/test split: classification framework
pHc.train <- pH.classes[training]
pHc.test <- pH.classes[-training]
c.train <- cbind(d.train, pHc.train)
c.test <- cbind(d.test, pHc.test)
colnames(c.train) <- c(colnames(d.train), "pHc")
colnames(c.test) <- c(colnames(d.train), "pHc")
c.train <- data.frame(c.train)
c.test <- data.frame(c.test)
c.train$pHc <- factor(c.train$pHc)
c.test$pHc <- factor(c.test$pHc)

# Short analysis of the training and test sets
layout(matrix(c(1,1,2,3,3,4), ncol=3, nrow=2, byrow=TRUE))
boxplot(d.train, main="Gene expressions (train)")
boxplot(pH.train, main="pH (train)")
boxplot(d.test, main="Gene expressions (test)")
boxplot(pH.test, main="pH (test)")
table(pHc.train)
table(pHc.test)

## Neural network

# Load the train/test split and the subset of selected genes
load("../../Data/train-test.Rdata")
load("../../Data/selected.Rdata")
library(nnet)

# Simple use: MLP with 3 hidden units and no weight decay (regression framework)
set.seed(17011644)
nn1 <- nnet(d.train[, selected], pH.train, size=3, decay=0, maxit=500, linout=TRUE)

# Analysis
print(nn1)
summary(nn1)

# Training error (MSE) and pseudo-R²
mean((pH.train - nn1$fitted)^2)
1 - mean((pH.train - nn1$fitted)^2)/var(pH.train)

# Predictions on the test set
pred.test <- predict(nn1, d.test[, selected])

# Test error (MSE) and pseudo-R²
mean((pH.test - pred.test)^2)
1 - mean((pH.test - pred.test)^2)/var(pH.train)

# Fitted values vs true values
plot(pH.train, nn1$fitted, xlab="Observations", ylab="Fitted values", main="", pch=3, col="blue")
points(pH.test, pred.test, pch=19, col="darkred")
legend("bottomright", pch=c(3,19), col=c("blue","darkred"), legend=c("Train","Test"))
abline(0, 1, col="darkgreen")

# Fit the same model again to see the variability of the performances
set.seed(17012120)
nn2 <- nnet(d.train[, selected], pH.train, size=3, decay=0, maxit=500, linout=TRUE)

# Analysis
print(nn2)
summary(nn2)

# Training error (MSE) and pseudo-R²
mean((pH.train - nn2$fitted)^2)
1 - mean((pH.train - nn2$fitted)^2)/var(pH.train)

# Predictions on the test set
pred.test <- predict(nn2, d.test[, selected])

# Test error (MSE) and pseudo-R²
mean((pH.test - pred.test)^2)
1 - mean((pH.test - pred.test)^2)/var(pH.train)

# Fitted values vs true values
plot(pH.train, nn2$fitted, xlab="Observations", ylab="Fitted values", main="", pch=3, col="blue")
points(pH.test, pred.test, pch=19, col="darkred")
legend("bottomright", pch=c(3,19), col=c("blue","darkred"), legend=c("Train","Test"))
abline(0, 1, col="darkgreen")
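# The MSE and pseudo-R² above are recomputed by hand for each model. As a small
# optional sketch (the helper names mse() and pseudo.r2() are not part of the
# original material), the two quantities can be wrapped into functions; as in
# the computations above, the variance of the *training* pH is the reference.
mse <- function(obs, pred) mean((obs - pred)^2)
pseudo.r2 <- function(obs, pred, ref.var = var(pH.train)) 1 - mse(obs, pred)/ref.var
# Example: reproduces the test MSE and pseudo-R² of nn2
mse(pH.test, pred.test)
pseudo.r2(pH.test, pred.test)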
values",main="",pch=3,col="blue") points(pH.test,pred.test,pch=19,col="darkred") legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test")) abline(0,1,col="darkgreen") # simple use: MLP with p=3 and no decay (classification framework) set.seed(17011716) nnc1 <- nnet(pHc~.,data=c.train,size=3,decay=0,maxit=500,linout=F) # Analysis print(nnc1) summary(nnc1) nnc1$fitted # Training error pred.train <- rep(0,length(nnc1$fitted)) pred.train[nnc1$fitted>0.5] <- 1 table(pred.train,pHc.train) sum(pred.train!=pHc.train)/length(pred.train) # Predictions and test error raw.pred.test <- predict(nnc1,c.test) pred.test <- rep(0,length(raw.pred.test)) pred.test[raw.pred.test>0.5] <- 1 table(pred.test,pHc.test) sum(pred.test!=pHc.test)/length(pred.test) # tuning MLP with e1071 by means of CV library(e1071) set.seed(1643) t.nnc2 <- tune.nnet(pHc~.,data=c.train[,c(selected,ncol(c.train))],size=c(2,5,10),decay=10^(-c(10,8,6,4,2)),maxit=500,linout=F,tunecontrol = tune.control(nrepeat=5,sampling="cross",cross=10)) # Analysis plot(t.nnc2) summary(t.nnc2) t.nnc2$best.parameters # Selecting the best MLP nnc2 <- t.nnc2$best.model # Training error pred.train <- rep(0,length(nnc2$fitted)) pred.train[nnc2$fitted>0.5] <- 1 table(pred.train,pHc.train) sum(pred.train!=pHc.train)/length(pred.train) # Predictions and test error raw.pred.test <- predict(nnc2,c.test) pred.test <- rep(0,length(raw.pred.test)) pred.test[raw.pred.test>0.5] <- 1 table(pred.test,pHc.test) sum(pred.test!=pHc.test)/length(pred.test) ## CART # Loading rpart package library(rpart) # Regression tree tree1 <- rpart(pH~.,data=r.train,control=rpart.control(minsplit=2)) print(tree1) summary(tree1) plot(tree1) text(tree1) tree1$where # leaf number tree1$frame # nodes features # Training predictions and error pred.train <- tree1$frame$yval[tree1$where] # predicted values (train set) mean((pred.train-r.train$pH)^2) 1-mean((pred.train-r.train$pH)^2)/var(pH.train) # Test predictions and error pred.test <- predict(tree1,r.test) mean((pred.test-r.test$pH)^2) 1-mean((pred.test-r.test$pH)^2)/var(pH.train) # Fitted values vs True values plot(pH.train,pred.train,xlab="Observations",ylab="Fitted values",main="",pch=3,col="blue") points(pH.test,pred.test,pch=19,col="darkred") legend("bottomright",pch=c(3,19),col=c("blue","darkred"),legend=c("Train","Test")) abline(0,1,col="darkgreen") # Classification tree and tuning with e1071 set.seed(20011108) t.treec1 <- tune.rpart(pHc~.,data=c.train,minsplit=c(4,6,8,10),tunecontrol=tune.control(sampling="bootstrap",nboot=20)) plot(t.treec1) t.treec1$best.parameters treec1 <- t.treec1$best.model summary(treec1) plot(treec1) text(treec1) treec1$where # leaf number treec1$frame # nodes features # Training predictions and error pred.train <- predict(treec1,c.train) # predicted values (train set) pred.train pred.train <- apply(pred.train,1,which.max) pred.train library(car) # to have the "recode" function pred.train <- recode(pred.train,"2=1;1=0") table(pred.train,pHc.train) sum(pred.train!=pHc.train)/length(pred.train) # Test predictions and error pred.test <- predict(treec1,c.test) pred.test <- apply(pred.test,1,which.max) pred.test <- recode(pred.test,"2=1;1=0") table(pred.test,pHc.test) sum(pred.test!=pHc.test)/length(pred.test) ## random forest # basic use (regression framework) library(randomForest) set.seed(21011144) rf1 <- randomForest(d.train,pH.train,importance=T,keep.forest=T) plot(rf1) rf1$ntree rf1$mtry rf1$importance # Predictions and errors mean((pH.train-rf1$predicted)^2) rf1$mse 
## Random forest

# Basic use (regression framework)
library(randomForest)
set.seed(21011144)
rf1 <- randomForest(d.train, pH.train, importance=TRUE, keep.forest=TRUE)
plot(rf1)
rf1$ntree
rf1$mtry
rf1$importance

# Predictions and errors
mean((pH.train - rf1$predicted)^2)  # OOB error
rf1$mse
1 - mean((pH.train - rf1$predicted)^2)/var(pH.train)          # OOB pseudo-R²
1 - mean((pH.train - predict(rf1, d.train))^2)/var(pH.train)  # training pseudo-R² (over-optimistic)
pred.test <- predict(rf1, d.test)
mean((pH.test - pred.test)^2)
1 - mean((pH.test - pred.test)^2)/var(pH.train)

# Fitted values vs true values
plot(pH.train, rf1$predicted, xlab="Observations", ylab="Fitted values", main="", pch=3, col="blue")
points(pH.test, pred.test, pch=19, col="darkred")
legend("bottomright", pch=c(3,19), col=c("blue","darkred"), legend=c("Train","Test"))
abline(0, 1, col="darkgreen")

# Importance analysis
layout(matrix(c(1,2), ncol=2))
barplot(t(rf1$importance[,1]), xlab="Variables", ylab="% Inc. MSE", col="darkred", las=2,
        names.arg=rep(NA, nrow(rf1$importance)))
barplot(t(rf1$importance[,2]), xlab="Variables", ylab="Inc. node purity", col="darkred", las=2,
        names.arg=rep(NA, nrow(rf1$importance)))
which(rf1$importance[,1] > 0.0005)

# More advanced features (classification framework)
set.seed(20011203)
rfc1 <- randomForest(d.train, factor(pHc.train), ntree=5000, mtry=150, sampsize=40, nodesize=2,
                     xtest=d.test, ytest=factor(pHc.test), importance=TRUE, keep.forest=FALSE)
plot(rfc1)
rfc1$importance

# Predictions and errors (OOB on the training set, xtest/ytest on the test set)
table(rfc1$predicted, pHc.train)
sum(rfc1$predicted != pHc.train)/length(pHc.train)
table(rfc1$test$predicted, pHc.test)
sum(rfc1$test$predicted != pHc.test)/length(pHc.test)

# Importance analysis
layout(matrix(c(1,2), ncol=2))
barplot(t(rfc1$importance[,3]), xlab="Variables", ylab="Mean dec. accuracy", col="darkred", las=2,
        names.arg=rep(NA, nrow(rfc1$importance)))
barplot(t(rfc1$importance[,4]), xlab="Variables", ylab="Mean dec. Gini", col="darkred", las=2,
        names.arg=rep(NA, nrow(rfc1$importance)))
which(rfc1$importance[,3] > 0.002)
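# Two optional conveniences from randomForest, shown as a sketch (not run in
# the original script): varImpPlot() draws the same importance measures as the
# barplots above (rf1 was fitted with importance=TRUE), and tuneRF() searches
# for an mtry value with low OOB error instead of fixing it by hand as for rfc1.
varImpPlot(rf1, main="Variable importance (regression forest)")
set.seed(21011230)  # arbitrary seed, chosen here only for reproducibility
tuneRF(d.train, factor(pHc.train), ntreeTry=500, stepFactor=1.5, improve=0.01)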