###############################################
## MACHINE LEARNING WORKSHOP
## SCRIPT 05 - K NEAREST NEIGHBORS
###############################################

### LOAD REQUIRED PACKAGES
library(kknn)

### CLASSIFICATION
# Download the training and test sets. `as.is = FALSE` asks read.csv() to
# convert character columns to factors.
train <- read.csv(
  "https://drive.switch.ch/index.php/s/8nNrBeeIOxilaKP/download",
  header = TRUE,
  as.is = FALSE
)
# Quick look at the first rows and the dimensions of the training set.
head(train)
dim(train)

test <- read.csv(
  "https://drive.switch.ch/index.php/s/wS6WmzMYCMTRgzH/download",
  header = TRUE,
  as.is = FALSE
)
# Same inspection for the test set.
head(test)
dim(test)

# Two panels side by side: training points (left) and test points (right).
# The third column is passed as the point colour, so it is assumed to hold
# valid R colour names -- TODO confirm against the data.
par(mar = c(5, 5, 1, 1), cex.axis = 1.2, cex.lab = 1.5, mfrow = c(1, 2))

# The training panel borrows its axis limits from the test set so that both
# panels share the same coordinate range.
plot(
  train[, 1:2],
  col = as.character(train[, 3]),
  xlim = range(test$X1),
  ylim = range(test$X2),
  pch = 18,
  cex = 1.2
)
plot(
  test[, 1:2],
  col = as.character(test[, 3]),
  pch = "~"
)

# Fit a 1-nearest-neighbour classifier on the two predictors X1 and X2 and
# predict the class of every row of `test`.
onenn <- kknn(Class ~ X1 + X2, train = train, k = 1, test = test)

# Pairwise distances (Euclidean by default) between the training observations.
# Only the numeric predictors are passed: handing the whole data frame to
# dist() would include the factor column `Class`, forcing a character-matrix
# coercion that can introduce NAs and distort the reported distances.
dist(train[, c("X1", "X2")])

# Left panel: test points in their true colours with the training points
# drawn on top. Right panel: test points coloured by the 1-NN prediction.
par(mar = c(5, 5, 1, 1), cex.axis = 1.2, cex.lab = 1.5, mfrow = c(1, 2))

plot(
  test[, 1:2],
  col = as.character(test[, 3]),
  pch = "~"
)
points(
  train[, 1:2],
  col = as.character(train[, 3]),
  pch = 18,
  cex = 1.2
)

plot(
  test[, 1:2],
  col = as.character(onenn$fitted),
  pch = "~"
)

# Confusion matrix (true class vs. 1-NN prediction) and misclassification
# rate on the test set. The 1-NN fit is stored in `onenn`; the previous code
# referred to a non-existent object `nn1` and stopped with an error.
table(True = test$Class, NN1 = onenn$fitted)
mean(test$Class != onenn$fitted)

# Search over k = 1..101 on the training data using all columns other than
# Class as predictors; the printed result reports the selected k.
train.kknn(Class ~ ., data = train, kmax = 101)

# Refit with k = 8 (presumably the value suggested by the search above --
# verify against its output) and compute the test misclassification rate.
nn8 <- kknn(
  Class ~ X1 + X2,
  train = train,
  k = 8,
  test = test
)
mean(test$Class != nn8$fitted)

# Left panel: true test classes with the training points on top.
# Right panel: test points coloured by the 8-NN prediction.
par(mar = c(5, 5, 1, 1), cex.axis = 1.2, cex.lab = 1.5, mfrow = c(1, 2))

plot(
  test[, 1:2],
  col = as.character(test[, 3]),
  pch = "~"
)
points(
  train[, 1:2],
  col = as.character(train[, 3]),
  pch = 18,
  cex = 1.2
)

plot(
  test[, 1:2],
  col = as.character(nn8$fitted),
  pch = "~"
)


### KNN SENSITIVITY TO NOISE
# Append 20 columns of pure standard-normal noise to both data sets and refit
# the 8-NN classifier on all columns to show how irrelevant features affect
# k-nearest neighbours.
set.seed(1234)

# Row counts are taken from the data instead of being hard-coded (previously
# 200 and 3415), so the script keeps working if the downloaded files change;
# with the original sizes the random draws are identical.
rtrain <- data.frame(
  train,
  matrix(rnorm(nrow(train) * 20), nrow(train), 20)
)
rtest <- data.frame(
  test,
  matrix(rnorm(nrow(test) * 20), nrow(test), 20)
)

# Note: this overwrites the earlier `nn8` fit that used only X1 and X2.
nn8 <- kknn(Class ~ ., train = rtrain, k = 8, test = rtest)

# Same two-panel display as before, now using whatever `nn8` currently holds;
# when the script is run top to bottom that is the fit with the noise columns.
par(mar = c(5, 5, 1, 1), cex.axis = 1.2, cex.lab = 1.5, mfrow = c(1, 2))

plot(
  test[, 1:2],
  col = as.character(test[, 3]),
  pch = "~"
)
points(
  train[, 1:2],
  col = as.character(train[, 3]),
  pch = 18,
  cex = 1.2
)

plot(
  test[, 1:2],
  col = as.character(nn8$fitted),
  pch = "~"
)


### REGRESSION
# Simulate a one-dimensional regression problem: a sine wave on top of a
# linear trend, observed with Gaussian noise (sd = 25) at x = 1, ..., 500.
set.seed(668)
x <- seq_len(500)
true <- 100 * sin(0.02 * x) + 0.5 * x
y <- true + rnorm(500, mean = 0, sd = 25)

# Scatter plot of the noisy observations.
par(mar = c(5, 5, 1, 1), cex.lab = 1.2, cex.axis = 1.2)
plot(
  x, y,
  pch = "+",
  col = "grey70",
  xlab = "X",
  ylab = "Y",
  cex = 1.2
)

# Fine prediction grid: 0.1, 0.2, ..., 500.0.
xval <- seq_len(5000) / 10

# Search over k = 1..101 for the regression of y on x; the printed result
# reports the selected k.
train.kknn(
  y ~ x,
  data = data.frame(x, y),
  kmax = 101
)

# Parametric reference fits: a straight line and a polynomial.
lin <- lm(y ~ x)
# NOTE(review): `poly4` was used in predict() below without ever being
# defined, which aborted the script. A degree-4 polynomial fit is assumed
# from the name -- confirm this matches the intended model.
poly4 <- lm(y ~ poly(x, 4))

# k-nearest-neighbour regressions evaluated on the fine grid `xval`, for a
# very small, a moderate and a very large neighbourhood size.
knn1 <- kknn(y ~ x, train = data.frame(x, y), test = data.frame(x = xval), k = 1)
knn24 <- kknn(y ~ x, train = data.frame(x, y), test = data.frame(x = xval), k = 24)
knn300 <- kknn(y ~ x, train = data.frame(x, y), test = data.frame(x = xval), k = 300)

# Predictions of the parametric fits on the same grid.
predlin <- predict(lin, newdata = data.frame(x = xval))
predpoly <- predict(poly4, newdata = data.frame(x = xval))

# Three panels, one per neighbourhood size. Each panel shows the raw data
# (grey crosses), the k-NN fit on the grid (dark blue line) and the linear
# fit (dark magenta points) for comparison.
par(mfrow = c(1, 3), mar = c(4, 5, 3, 0.5), cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5)

# The panels differ only in the fit, the title and the line width (the 1-NN
# curve is drawn thinner), so they are produced by one shared routine.
# invisible() suppresses printing of the list that Map() returns.
invisible(Map(
  function(fit, panel_title, line_width) {
    plot(
      x, y,
      pch = "+",
      col = "grey70",
      xlab = "X",
      ylab = "Y",
      cex = 1.2,
      main = panel_title
    )
    lines(xval, fit$fitted, col = "darkblue", lwd = line_width)
    points(xval, predlin, col = "darkmagenta", cex = 0.7)
  },
  list(knn1, knn24, knn300),
  c("1-nearest neighbor", "24-nearest neighbors", "300-nearest neighbors"),
  c(1, 3, 3)
))


