# Function to compute the Euclidean distance between vectors x and y
d.Eucl <- function(x, y) {
  if (length(x) != length(y))
    stop("Vectors x and y must have the same length.")
  return(sqrt(sum((x - y)^2)))
}

# Function to compute the matrix of distances D for the instances in matrix M.
# D[i,j] = D[j,i] is the distance between vectors M[i,] and M[j,].
d.Eucl.mat <- function(M) {
  n <- nrow(M)
  rnames <- rownames(M)
  D <- matrix(0, nrow = n, ncol = n)
  rownames(D) <- colnames(D) <- rnames
  for (k in seq_len(n - 1)) {
    y <- as.vector(M[k, ])
    # computes the distances of the current vector to the remaining vectors
    D[k, k:n] <- apply(M[k:n, , drop = FALSE], 1, d.Eucl, y = y)
    # distance is symmetric
    D[k:n, k] <- D[k, k:n]
  }
  return(D)
}

# Like d.Eucl.mat, but it uses the R library function dist, which calls C
# routines implementing the loops.
d.Eucl.mat2 <- function(M) {
  return(as.matrix(dist(M, method = "euclidean")))
}

# Function to compute the average fraction of wrong predictions in pred with
# respect to the labels in y. Both y and pred are -1/1-valued vectors.
compute.avg.err <- function(y, pred) {
  return(sum(y != pred) / length(y))
}

# M: n x d instance matrix. y: -1/1 label vector. train and test: numeric
# vectors partitioning the rows of matrix M.
# Output: the best k for the kNN classifier, i.e. the k minimizing the test
# error (ties broken by the training error, then by the smallest k).
train.kNN.test <- function(M, y, train, test) {
  n <- nrow(M)
  if (length(intersect(train, test)) > 0 ||
      length(setdiff(1:n, union(train, test))) != 0)
    stop("train and test must constitute a partition of the rows of M.")
  D <- d.Eucl.mat2(M)  # named D to avoid shadowing the base function dist
  k.vect <- 1:min(50, length(train))  # k cannot exceed the number of training instances
  train.err <- rep(0, length(k.vect))
  test.err <- rep(0, length(k.vect))
  for (k in seq_along(k.vect)) {
    # predicting test instances with kNN: each test instance gets the sign of
    # the sum of the labels of its k nearest training instances (restricting
    # the neighbours to the training set, so that no test label is used)
    pred <- apply(D[test, train, drop = FALSE], 1, function(x, y, k) {
      b <- order(x, decreasing = FALSE)
      return(sign(sum(y[b[1:k]])))
    }, y = y[train], k = k.vect[k])
    # default -1 when having the same number of positive and negative neighbours
    pred[pred == 0] <- -1
    test.err[k] <- compute.avg.err(y[test], pred)
    # predicting training instances; each training instance is its own nearest
    # neighbour, so the training error is optimistic for small k
    pred <- apply(D[train, train, drop = FALSE], 1, function(x, y, k) {
      b <- order(x, decreasing = FALSE)
      return(sign(sum(y[b[1:k]])))
    }, y = y[train], k = k.vect[k])
    pred[pred == 0] <- -1
    train.err[k] <- compute.avg.err(y[train], pred)
  }
  # best k: minimum test error, ties resolved by minimum training error
  k.best <- which(test.err == min(test.err))
  if (length(k.best) > 1)
    k.best <- k.best[which(train.err[k.best] == min(train.err[k.best]))]
  k.best <- k.best[1]
  # plot of the training and test error as a function of k
  png("train_test_err.png")
  plot(k.vect, train.err, xlim = range(k.vect),
       ylim = c(min(train.err, test.err), max(train.err, test.err)),
       ylab = "Error rate", xlab = "k", cex.lab = 1.3, cex.axis = 1.3,
       col = "blue", type = "l")
  lines(k.vect, test.err, col = "red", lty = 2)
  legend(x = "topright", legend = c("Training error", "Test error"),
         col = c("blue", "red"), lty = 1:2, cex = 1.2)
  dev.off()
  return(k.vect[k.best])
}
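
# Minimal usage sketch (illustrative, not part of the original functions):
# it builds a small synthetic two-class dataset, checks that the two distance
# implementations agree, and selects the best k with train.kNN.test. The seed,
# the Gaussian class means and the 70/30 split below are arbitrary assumptions
# chosen only for this example.
set.seed(42)
n <- 100; d <- 2
M <- rbind(matrix(rnorm((n / 2) * d, mean = 0), ncol = d),
           matrix(rnorm((n / 2) * d, mean = 2), ncol = d))
rownames(M) <- paste0("inst", 1:n)
y <- c(rep(1, n / 2), rep(-1, n / 2))        # -1/1 labels
train <- sample(1:n, size = 0.7 * n)         # 70% training instances
test <- setdiff(1:n, train)                  # remaining 30% test instances
# sanity check: the hand-written and the dist-based matrices should coincide
stopifnot(isTRUE(all.equal(d.Eucl.mat(M), d.Eucl.mat2(M))))
best.k <- train.kNN.test(M, y, train, test)  # also writes train_test_err.png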