newfish<-read.table("../DATA/newfish.dat",h=T)
newfish.test<-read.table("../DATA/newfish.test.txt",h=T)

cor(newfish[,-1])
summary(newfish)

# Parallel coordinates: Plotting each observation across all variables

library(MASS)
parcoord(newfish[,-1],col=c(1:7)) 

#classification tree

install.packages(rpart)
 
library(rpart)
fish.control<-rpart.control(minsplit=10,minbucket=3,xval=0)
newfish.treeorig<-rpart(Species ~ Weight+L1+L2+L3+Height+Width,
    data=newfish,method="class",control=fish.control)

plot(newfish.treeorig)
text(newfish.treeorig)

names(newfish.treeorig)
summary(newfish.treeorig)
printcp(newfish.treeorig)

plot(newfish.treeorig,branch=0,unif=T)
text(newfish.treeorig)

plot(newfish.treeorig,uniform=T,branch=0)
text(newfish.treeorig,fancy=T)

#include number of sujects in terminal nodes

plot(newfish.treeorig,uniform=T,branch=0)
text(newfish.treeorig,use.n=T)

#estimate the apparent error rate

orig.pred=predict(newfish.treeorig,newfish[,-1],type="class")
orig.tab=table(newfish$Species,orig.pred)

newfish.prunetree<-prune.rpart(newfish.treeorig,cp=.02)
printcp(newfish.prunetree)
plot(newfish.prunetree)
text(newfish.prunetree)

newfish.treenew<-rpart(Species ~ .,data=newfish,method="class",
        parms=list(split='information'),control=fish.control)
printcp(newfish.treenew)
new.pred=predict(newfish.treenew,newfish[,-1],type="class")
new.tab=table(newfish$Species,new.pred)

#cross-validation

cv1=xpred.rpart(newfish.treeorig,xval=148)

fish.control<-rpart.control(minbucket=3,minsplit=10,xval=148)
newfish.treenewcv<-rpart(Species ~ .,data=newfish,method="class",
        parms=list(split='information'),control=fish.control)

printcp(newfish.treenewcv)

# xerror is the cross-validation error rate. There is some randomness 
# involved in the error rate estimate. 

cv.pred=predict(newfish.treenewcv,newfish[,-1],type="class")
cv.tab=table(newfish$Species,cv.pred)

plotcp(newfish.treenewcv)

plot(newfish.treenewcv,uniform=T,branch=0)
text(newfish.treenewcv,use.n=T)

newfish.tpred<-predict(newfish.treenewcv,newfish.test)
newfish.tpred

#LDA 

library(MASS)
newfish.lda<-lda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31,
  data=newfish)

newfish.lda<-lda(Species ~ Weight+L2+Height+Width+L21+L32,
   data=newfish)
newfish.lda

newfish.ldapred<-predict(newfish.lda,newfish[,-1])
table(newfish$Species,newfish.ldapred$class)
       
eqscplot(newfish.ldapred$x,type="n",xlab="1st LD",ylab="2nd LD")
fish.species<-c(rep("B",33),rep("W",5),rep("R",18),rep("Pa",10),rep("S",12),
     rep("Pi",16),rep("Pe",54))
text(newfish.ldapred$x[,1:2],fish.species)
 
newfish.ldatest<-predict(newfish.lda,newfish.test)
newfish.ldatest$class

#QDA
newfish.q1<-qda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31,
  data=newfish)

#remove group white from data
newfishq=read.table("../DATA/newfishq.dat",h=T)

#collinearity
newfish.qda<-qda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31,
  data=newfishq)

newfish.qda<-qda(Species ~ Weight+L1+Height+Width+L21+L32,data=newfishq)
newfish.qda

table(newfishq$Species,predict(newfish.qda)$class)
predict(newfish.qda,newfish.test)$class

newfish.qdacv<-qda(Species ~ Weight+L1+Height+Width+L21+L32,data=newfishq,CV=T)
newfish.qdacv
table(newfishq$Species,newfish.qdacv$class)

# nearest neighbor

library(class)
newfish.knn<-knn(newfish[,2:10],newfish[,2:10],newfish[,"Species"],k=3,prob=T)
table(newfish$Species,newfish.knn)

newfish.knn.cv<-knn.cv(newfish[,2:10],newfish[,"Species"],k=3,prob=T)
table(newfish$Species,newfish.knn.cv)

newfish1<-newfish[,c(1,2,3,6,8,9)]
newfish.knn<-knn(newfish1[,2:6],newfish1[,2:6],newfish1[,"Species"],k=3,prob=T)
table(newfish$Species,newfish.knn)
sum(table(newfish$Species,newfish.knn))-sum(diag(table(newfish$Species,newfish.knn)))

newfish.knn.cv<-knn.cv(newfish1[,2:6],newfish1[,2:6],newfish1[,"Species"],k=3,prob=T)
table(newfish$Species,newfish.knn.cv)

newfish1.test<-newfish.test[,c(1,2,5,7,8)]
newfish.knntest<-knn(newfish1[,2:6],newfish1.test,newfish1[,"Species"],k=2,prob=T)
newfish.knntest

# attribute selection

newfish2<-newfish[,c(1,2,3,6)]
newfish.knn<-knn(newfish2[,-1],newfish2[,-1],newfish2[,"Species"],k=2,prob=T)
nt=table(newfish$Species,newfish.knn)
sum(nt)-sum(diag(nt))

newfish2=newfish
newfish.knn<-knn(newfish2[,-1],newfish2[,-1],newfish2[,"Species"],k=2,prob=T)
nt=table(newfish$Species,newfish.knn)
sum(nt)-sum(diag(nt))

#logistic discriminant analysis

library(nnet)
newfish.logd<-multinom(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31,data=newfish,maxit=250)
newfish.logd
 
table(newfish$Species,predict(newfish.logd,newfish))     
       
predict(newfish.logd,newfish.test)

# Penalized logistic regression

install.packages("stepPlr")

library("stepPlr")

y=rep(1,148)
y[newfish$Species=="perch"]=0
newfish.plr <- plr(newfish[,c(2,4,6,7,8)],y,lambda=1)

# SVM 

install.packages("e1071")
install.packages("kernlab")
install.packages("mlbench")

library(e1071)
library(kernlab)
library(mlbench)

data(iris)
data(promotergene)

# Package kernlab

irismodel <- ksvm(Species ~ ., data = iris,type = "C-bsvc", kernel = "rbfdot",
   	kpar = list(sigma = 0.1), C = 10, prob.model = TRUE)

irismodel

predict(irismodel, iris[c(3, 10, 56, 68,107, 120), -5], type = "probabilities")

# user defined kernel function by just defining a function
# which takes two vector arguments and returns its Hilbert Space 
# dot product in scalar form.

k <- function(x, y) {
 (sum(x * y) + 1) * exp(0.001 * sum((x - y)^2))
}

class(k) <- "kernel"
gene <- ksvm(Class ~ ., data = promotergene, kernel = k, C = 10, cross = 5)
gene

# plot
x <- rbind(matrix(rnorm(120), , 2), matrix(rnorm(120,mean = 3), , 2))
y <- matrix(c(rep(1, 60), rep(-1, 60)))
svp <- ksvm(x, y, type = "C-svc", kernel = "rbfdot",kpar = list(sigma = 2))
plot(svp)

# Package e1071

model <- svm(Species ~ ., data = iris, method = "C-classification", 
	kernel = "radial",cost = 10, gamma = 0.1)
summary(model)

plot(model, iris, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3,
 Sepal.Length = 4))

pred <- predict(model, head(iris), decision.values = TRUE)
attr(pred, "decision.values")

# tune parameters

tobj<-tune.svm(Species~., data=iris, gamma = 10^(-6:-1), cost = 10^(1:4))
summary(tobj)
plot(tobj, transform.x = log10, xlab = expression(log[10](gamma)),ylab = "C")

bestGamma <- tobj$best.parameters[[1]]
bestC <- tobj$best.parameters[[2]]
model <- svm(Species ~ ., data =iris,cost = bestC, gamma = bestGamma, cross = 10)
summary(model)
pred <- predict(model,iris)
cc <- table(pred,iris$Species)
classAgreement(cc)


######################################################################
#random forest, commands coded:											
#
# MDSplot                          
# classCenter                       
# combine                           
# getTree                           
# grow                              
# importance                       
# imports85                        
# margin                            
# na.roughfix                       
# outlier                           
# partialPlot                       
# plot.randomForest                 
# predict.randomForest              
# randomForest                      
# rfImpute                          
# rfNews                            
# treesize                          
# tuneRF                            
# varImpPlot                        
# varUsed                           
#
#####################################################################
 
install.packages("randomForest")

library(randomForest)
set.seed(71)

newfish.rf <- randomForest(Species ~ . , data=newfish,ntree=1000,mtry=3,importance=TRUE,proximity=TRUE)

print(newfish.rf)
varUsed(newfish.rf)
hist(treesize(newfish.rf))
dev.off()

plot(margin(newfish.rf,newfish$Species))

print(round(newfish.rf$importance,2))

varImpPlot(newfish.rf)

newfish.res <- tuneRF(newfish[,-1],newfish[,1],stepFactor=0.5)

newfish.pred <- predict(newfish.rf,newfish.test)

table(predicted=newfish.pred)
newfish.pred