newfish<-read.table("../DATA/newfish.dat",h=T) newfish.test<-read.table("../DATA/newfish.test.txt",h=T) cor(newfish[,-1]) summary(newfish) # Parallel coordinates: Plotting each observation across all variables library(MASS) parcoord(newfish[,-1],col=c(1:7)) #classification tree install.packages(rpart) library(rpart) fish.control<-rpart.control(minsplit=10,minbucket=3,xval=0) newfish.treeorig<-rpart(Species ~ Weight+L1+L2+L3+Height+Width, data=newfish,method="class",control=fish.control) plot(newfish.treeorig) text(newfish.treeorig) names(newfish.treeorig) summary(newfish.treeorig) printcp(newfish.treeorig) plot(newfish.treeorig,branch=0,unif=T) text(newfish.treeorig) plot(newfish.treeorig,uniform=T,branch=0) text(newfish.treeorig,fancy=T) #include number of sujects in terminal nodes plot(newfish.treeorig,uniform=T,branch=0) text(newfish.treeorig,use.n=T) #estimate the apparent error rate orig.pred=predict(newfish.treeorig,newfish[,-1],type="class") orig.tab=table(newfish$Species,orig.pred) newfish.prunetree<-prune.rpart(newfish.treeorig,cp=.02) printcp(newfish.prunetree) plot(newfish.prunetree) text(newfish.prunetree) newfish.treenew<-rpart(Species ~ .,data=newfish,method="class", parms=list(split='information'),control=fish.control) printcp(newfish.treenew) new.pred=predict(newfish.treenew,newfish[,-1],type="class") new.tab=table(newfish$Species,new.pred) #cross-validation cv1=xpred.rpart(newfish.treeorig,xval=148) fish.control<-rpart.control(minbucket=3,minsplit=10,xval=148) newfish.treenewcv<-rpart(Species ~ .,data=newfish,method="class", parms=list(split='information'),control=fish.control) printcp(newfish.treenewcv) # xerror is the cross-validation error rate. There is some randomness # involved in the error rate estimate. cv.pred=predict(newfish.treenewcv,newfish[,-1],type="class") cv.tab=table(newfish$Species,cv.pred) plotcp(newfish.treenewcv) plot(newfish.treenewcv,uniform=T,branch=0) text(newfish.treenewcv,use.n=T) newfish.tpred<-predict(newfish.treenewcv,newfish.test) newfish.tpred #LDA library(MASS) newfish.lda<-lda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31, data=newfish) newfish.lda<-lda(Species ~ Weight+L2+Height+Width+L21+L32, data=newfish) newfish.lda newfish.ldapred<-predict(newfish.lda,newfish[,-1]) table(newfish$Species,newfish.ldapred$class) eqscplot(newfish.ldapred$x,type="n",xlab="1st LD",ylab="2nd LD") fish.species<-c(rep("B",33),rep("W",5),rep("R",18),rep("Pa",10),rep("S",12), rep("Pi",16),rep("Pe",54)) text(newfish.ldapred$x[,1:2],fish.species) newfish.ldatest<-predict(newfish.lda,newfish.test) newfish.ldatest$class #QDA newfish.q1<-qda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31, data=newfish) #remove group white from data newfishq=read.table("../DATA/newfishq.dat",h=T) #collinearity newfish.qda<-qda(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31, data=newfishq) newfish.qda<-qda(Species ~ Weight+L1+Height+Width+L21+L32,data=newfishq) newfish.qda table(newfishq$Species,predict(newfish.qda)$class) predict(newfish.qda,newfish.test)$class newfish.qdacv<-qda(Species ~ Weight+L1+Height+Width+L21+L32,data=newfishq,CV=T) newfish.qdacv table(newfishq$Species,newfish.qdacv$class) # nearest neighbor library(class) newfish.knn<-knn(newfish[,2:10],newfish[,2:10],newfish[,"Species"],k=3,prob=T) table(newfish$Species,newfish.knn) newfish.knn.cv<-knn.cv(newfish[,2:10],newfish[,"Species"],k=3,prob=T) table(newfish$Species,newfish.knn.cv) newfish1<-newfish[,c(1,2,3,6,8,9)] newfish.knn<-knn(newfish1[,2:6],newfish1[,2:6],newfish1[,"Species"],k=3,prob=T) table(newfish$Species,newfish.knn) sum(table(newfish$Species,newfish.knn))-sum(diag(table(newfish$Species,newfish.knn))) newfish.knn.cv<-knn.cv(newfish1[,2:6],newfish1[,2:6],newfish1[,"Species"],k=3,prob=T) table(newfish$Species,newfish.knn.cv) newfish1.test<-newfish.test[,c(1,2,5,7,8)] newfish.knntest<-knn(newfish1[,2:6],newfish1.test,newfish1[,"Species"],k=2,prob=T) newfish.knntest # attribute selection newfish2<-newfish[,c(1,2,3,6)] newfish.knn<-knn(newfish2[,-1],newfish2[,-1],newfish2[,"Species"],k=2,prob=T) nt=table(newfish$Species,newfish.knn) sum(nt)-sum(diag(nt)) newfish2=newfish newfish.knn<-knn(newfish2[,-1],newfish2[,-1],newfish2[,"Species"],k=2,prob=T) nt=table(newfish$Species,newfish.knn) sum(nt)-sum(diag(nt)) #logistic discriminant analysis library(nnet) newfish.logd<-multinom(Species ~ Weight+L1+L2+L3+Height+Width+L21+L32+L31,data=newfish,maxit=250) newfish.logd table(newfish$Species,predict(newfish.logd,newfish)) predict(newfish.logd,newfish.test) # Penalized logistic regression install.packages("stepPlr") library("stepPlr") y=rep(1,148) y[newfish$Species=="perch"]=0 newfish.plr <- plr(newfish[,c(2,4,6,7,8)],y,lambda=1) # SVM install.packages("e1071") install.packages("kernlab") install.packages("mlbench") library(e1071) library(kernlab) library(mlbench) data(iris) data(promotergene) # Package kernlab irismodel <- ksvm(Species ~ ., data = iris,type = "C-bsvc", kernel = "rbfdot", kpar = list(sigma = 0.1), C = 10, prob.model = TRUE) irismodel predict(irismodel, iris[c(3, 10, 56, 68,107, 120), -5], type = "probabilities") # user defined kernel function by just defining a function # which takes two vector arguments and returns its Hilbert Space # dot product in scalar form. k <- function(x, y) { (sum(x * y) + 1) * exp(0.001 * sum((x - y)^2)) } class(k) <- "kernel" gene <- ksvm(Class ~ ., data = promotergene, kernel = k, C = 10, cross = 5) gene # plot x <- rbind(matrix(rnorm(120), , 2), matrix(rnorm(120,mean = 3), , 2)) y <- matrix(c(rep(1, 60), rep(-1, 60))) svp <- ksvm(x, y, type = "C-svc", kernel = "rbfdot",kpar = list(sigma = 2)) plot(svp) # Package e1071 model <- svm(Species ~ ., data = iris, method = "C-classification", kernel = "radial",cost = 10, gamma = 0.1) summary(model) plot(model, iris, Petal.Width ~Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4)) pred <- predict(model, head(iris), decision.values = TRUE) attr(pred, "decision.values") # tune parameters tobj<-tune.svm(Species~., data=iris, gamma = 10^(-6:-1), cost = 10^(1:4)) summary(tobj) plot(tobj, transform.x = log10, xlab = expression(log[10](gamma)),ylab = "C") bestGamma <- tobj$best.parameters[[1]] bestC <- tobj$best.parameters[[2]] model <- svm(Species ~ ., data =iris,cost = bestC, gamma = bestGamma, cross = 10) summary(model) pred <- predict(model,iris) cc <- table(pred,iris$Species) classAgreement(cc) ###################################################################### #random forest, commands coded: # # MDSplot # classCenter # combine # getTree # grow # importance # imports85 # margin # na.roughfix # outlier # partialPlot # plot.randomForest # predict.randomForest # randomForest # rfImpute # rfNews # treesize # tuneRF # varImpPlot # varUsed # ##################################################################### install.packages("randomForest") library(randomForest) set.seed(71) newfish.rf <- randomForest(Species ~ . , data=newfish,ntree=1000,mtry=3,importance=TRUE,proximity=TRUE) print(newfish.rf) varUsed(newfish.rf) hist(treesize(newfish.rf)) dev.off() plot(margin(newfish.rf,newfish$Species)) print(round(newfish.rf$importance,2)) varImpPlot(newfish.rf) newfish.res <- tuneRF(newfish[,-1],newfish[,1],stepFactor=0.5) newfish.pred <- predict(newfish.rf,newfish.test) table(predicted=newfish.pred) newfish.pred