########## R script: planktonSVM ######### # For illustrating some support vector machine # classification of plankton species. # Last changed: 07 FEB 2007 # Load in `e1071' library; which contains # support vector machine software. library(e1071) # Define wait() function wait <- function(){cat("Hit return to continue\n"); ans <- readline()} # Read in the plankton data alldat <- read.table("plankton.dat") point.cols <- c("red","blue","green3","orange","purple") image.cols <- c("IndianRed1","cyan","PaleGreen","NavajoWhite","thistle") x1 <- alldat[,1] x2 <- alldat[,2] plank.type <- alldat[,3] species.names <- c("Dunaliella","Hemiselmis","Isochrysis", "Pavlova","Pyramimonas") X <- cbind(x1,x2) # Select a subset of 1000 observations to use for testing. # The remainder will be used as training data. n <- nrow(X) set.seed(349202) inds.test <- sample(1:n,1000,replace=F) inds.train <- setdiff(1:n,inds.test) # Plot the training data plot(x1[inds.train],x2[inds.train],pch=1,cex=0.3,lwd=2, col = point.cols[plank.type[inds.train]], xlab="forward scatter",ylab="red flour. under red light", main="training data") legend(1150,2200,legend=species.names,col=point.cols,pch=rep(16,5)) wait() # Obtain linear discriminant analysis (LDA) classification boundary. plankton.df <- data.frame(plank.type,x1,x2) plankton.df <- plankton.df[inds.train,] fit <- svm(factor(plank.type)~.,data=plankton.df,cost=1,gamma=1) # Display the result. mesh.size <- 101 x1.grid <- seq(min(x1),max(x1),length=mesh.size) x2.grid <- seq(min(x2),max(x2),length=mesh.size) x.mesh <- expand.grid(x1.grid,x2.grid) names(x.mesh) <- c("x1","x2") disc.est <- matrix(as.numeric(predict(fit,x.mesh)),mesh.size,mesh.size) image(x1.grid,x2.grid,disc.est,zlim=c(1,5),col=image.cols, xlab="forward scatter",ylab="red flour. under red light", main="support vector machine (SVM)") wait() points(x1[inds.test],x2[inds.test],pch=1,cex=0.3, lwd=2,col = point.cols[plank.type[inds.test]]) wait() # Obtain the confusion matrix and misclassification rate x.test <- X[inds.test,] names(x.test) <- c("x1","x2") pred.classes <- predict(fit,x.test) true.classes <- plank.type[inds.test] confus.mat <- table(pred.classes,true.classes) dimnames(confus.mat)[[1]] <- species.names dimnames(confus.mat)[[2]] <- species.names cat("The confusion matrix is\n") print(confus.mat) misclass.rate <- (sum(confus.mat) - sum(diag(confus.mat)))/1000 cat("\n\n The missclassification rate is (as a percentage) ") cat(round(100*misclass.rate,2),"\n\n") ########## End of planktonSVM.Rs #########