#3.6.2 Simple Linear Regression
library(MASS) # the Boston data set lives in the MASS package
fix(Boston)
names(Boston)
attach(Boston)
#medv response, lstat predictor
lm.fit=lm(medv~lstat)
#basic information
lm.fit
#detailed information
summary(lm.fit)
#pieces of information stored in lm.fit
names(lm.fit)
terms(lm.fit)
residuals(lm.fit)
plot(residuals(lm.fit))
#confidence intervals
confint(lm.fit)
#predict() is used to compute confidence/prediction intervals for a given value of the predictor
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="confidence")
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="prediction") # both centered on the same value but the latter is wider
#plot the response (medv), predictor(lstat), and regression line
plot(lstat,medv)
abline(lm.fit)
#diagnostic plots
par(mfrow=c(2,2))
plot(lm.fit)
#residuals() returns the residuals while rstudent() returns the studentized residuals, which we plot against the fitted values [predict(lm.fit)]
plot(predict(lm.fit),residuals(lm.fit))
plot(predict(lm.fit),rstudent(lm.fit))
#leverage statistics
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
#3.6.3 Multiple Linear Regression
lm.fit=lm(medv~lstat+age,data=Boston)
summary(lm.fit)
lm.fit=lm(medv~.,data=Boston)
summary(lm.fit)
library(car)
#variance inflation factor
vif(lm.fit)
#all predictors except age
lm.fit=lm(medv~.-age,data=Boston)
summary(lm.fit)
#3.6.4 Interaction Terms
#lstat*age=lstat+age+lstat:age <- interaction term
summary(lm(medv~lstat*age,data=Boston))
#3.6.5 Non-Linear Transformation of the Predictors
lm.fit2=lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm.fit2)
#comparing models using anova

anova(lm.fit,lm.fit2)
lm.fit5=lm(medv~poly(lstat,5),data=Boston)
#log transformation of the predictors [ useful for heteroscedasticity and other problems ]
summary(lm(medv~log(rm),data=Boston))
#3.6.6 Qualitative Predictors
library(ISLR) # Carseats (and the other ISLR data sets used below) live in the ISLR package
fix(Carseats)
names(Carseats)
#lm with interaction terms
lm.fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats) # all predictors plus the two interaction terms
summary(lm.fit)
#contrasts() returns the coding done for dummy vars.
attach(Carseats)
contrasts(ShelveLoc)

################ EXERCISES ##############


attach(Auto)
summary(Auto)
##8
#a
lm.fit=lm(mpg~horsepower,data=Auto)
summary(lm.fit)
#i: Since the F statistic is far larger than 1 and the p-value of the F stat. is very small, we can reject the null hypothesis.
#ii: We evaluate the residual error relative to the response using mean(mpg) and the RSE; mean(mpg)=24.449 and the RSE of lm.fit was
#    4.906, i.e. 4.906/24.449*100 = 20.06%. R^2 = 0.6, meaning 60% of the variance is explained by the model.
#iii: The relationship between mpg and horsepower is negative.
#iv.
predict (lm.fit ,data.frame(horsepower=c(98)),interval ="prediction")
predict (lm.fit ,data.frame(horsepower=c(98)),interval ="confidence")
#b
plot(horsepower,mpg)
abline(lm.fit)
#c
par(mfrow=c(2,2))
plot(lm.fit)
##9
#a
pairs(Auto)
#b
cor(subset(Auto,select=-name))

#c
lm.fit=lm(mpg~.-name,data=Auto)
summary(lm.fit)
#i. yes, F stat is far from one and very small p value
#iii. the year coefficient suggests that, holding the other predictors fixed, mpg increases by roughly that amount for each additional model year
#d
plot(lm.fit)
par(mfrow=c(1,1))
plot(predict(lm.fit),rstudent(lm.fit))
#e
lm.fit1=lm(mpg~cylinders*displacement+displacement:weight,data=Auto)
summary(lm.fit1)
#f
lm.fit=lm(mpg~sqrt(weight)+log(displacement)+I(cylinders^2),data=Auto)
summary(lm.fit)
##10
#a
attach(Carseats)
names(Carseats)
summary(Carseats)
fix(Carseats)
lm.fit=lm(Sales~Price+Urban+US,data=Carseats)
summary(lm.fit)
#b
# as price increases, sales decrease; the coefficient is significant (small p value)
# the model suggests that there is no relationship between the location of the store (Urban) and sales
# the fact that the store is located in the US is significant; sales increase by about 1200 units for US stores
#c
#sales=13.04-0.05*price-0.02*urbanYES+1.2*USyes
#d
#the null hypothesis can be rejected for Price and USYes
#e
lm.fit2=lm(Sales~Price+US,data=Carseats)
summary(lm.fit2)
#f
#similar values for R Squared and RSE
#g
confint(lm.fit2)
#h
plot(predict(lm.fit2),rstudent(lm.fit2)) # all between -3 and 3 so no outliers
plot(lm.fit2) #points that exceed (p+1) / n have high leverage
dim(Carseats)
(2+1)/400
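#a quick illustrative check (not in the original notes): compare each observation's leverage with the (p+1)/n value computed above
lev=hatvalues(lm.fit2)
plot(lev)
abline(h=(2+1)/400,lty="dashed") # (p+1)/n, the average leverage
which.max(lev)                   # the single highest-leverage observation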
##11
#a
set.seed(1)
x=rnorm(100)
y=2*x+rnorm(100)
lm.fit=lm(y~x+0)
summary(lm.fit)
#b

lm.fit1=lm(x~y+0)
summary(lm.fit1)
#c
plot(x,y)
abline(lm.fit)
abline(lm.fit1)
#d
sqrt(length(x)-1)*sum(x*y)/sqrt(sum(x^2)*sum(y^2) - sum(x*y)^2)
# t stat shown above
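#as a sanity check (added here, not part of the original solution), the same t statistic can be read off the
#no-intercept fits, and it is identical whether we regress y on x or x on y
coef(summary(lm(y~x+0)))[1,"t value"]
coef(summary(lm(x~y+0)))[1,"t value"]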
#f
lm.fit1=lm(x~y)
lm.fit2=lm(y~x)
summary(lm.fit1)
summary(lm.fit2)

##15
#a
library(MASS)
attach(Boston)
names(Boston)
lm.fit1=lm(crim~zn,data=Boston)
summary(lm.fit1)
#...
#b
lm.fit.all=lm(crim~.,data=Boston)
summary(lm.fit.all) # zn, age, dis, black, medv
################################################################################
# the chapter with the boxplot summaries, then the lab and the exercises
# Logistic regression, LDA, QDA, KNN
################################################################################
# PAGE 151. !!
#Both LOG REG and LDA produce linear decision boundaries. The only difference lies in the fact that B0 and B1 are estimated using maximum
#likelihood, whereas c0 and c1 are computed using the estimated mean and variance from a normal distribution.
# LDA assumes that the obs. are drawn from a Gaussian distribution with a common covariance matrix in each class and provides improvements
# over log. reg. when this assumption holds. When the Gaussian assumptions are not met, log. reg. outperforms LDA.
# On the other hand, KNN is expected to dominate log. reg. and LDA when the boundary is highly non-linear because it is non-parametric
# and makes no assumption about the shape of the boundary. KNN doesn't tell which predictors are important and doesn't give a coefficients table.
# QDA serves as an intermediary between LDA + log. reg. and KNN because it assumes a quadratic decision boundary, and therefore is
# more flexible.
################################################################################
#
# #Scenario 1 (linear): 20 obs in each class, uncorrelated random normal obs., different mean in each class.
#   LDA first because it assumes exactly this boundary, log. reg. after bc. it is similar. KNN performed poorly bc its
#   increase in variance was not offset by a reduction in bias. QDA performed worse than LDA and LOG but better than KNN bc
#   it assumes a quadratic boundary.
#
# #Scenario 2 (linear): same scenario as the first one but with cor. of -0.5 between the variables. Same results.
#
# #Scenario 3 (linear): samples x1 and x2 drawn from the t-distribution with 50 obs per class. Due to the fact that the t distr. yields
#   more extreme values, even though the decision boundary is still linear, it violates the assumptions of
#   LDA (which assumes obs. are drawn from a normal distrib.). Log. reg. best, QDA worst.
#
# #Scenario 4 (non-linear): normal distribution, cor 0.5 in the first group, cor -0.5 in the second group. This corresponds to QDA, which outperforms all.
#
# #Scenario 5 (non-linear): two classes, uncorrelated predictors, normal distributions. The responses were sampled from the logistic function
#   using x1^2, x2^2, x1*x2 as predictors. This corresponds to a quadratic boundary, so QDA performed best followed
#   by KNN-CV, the linear methods having the worst performance.
#
# #Scenario 6 (non-linear): same as the previous one but sampled from a more complicated non-linear function. KNN-CV best, followed by QDA and
#   the linear methods.
## Using transformations of the predictors, we can create more flexible versions of these methods: e.g. a more flexible version of
# logistic regression (use x^2, x^3, and even x^4 as predictors).
################################################################################
#4.6 Lab: Logistic Regression, LDA, QDA, and KNN

library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)
cor(Smarket[,-9])
attach(Smarket)
plot(Volume)
#4.6.2 Logistic Regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
coef(glm.fit)
summary(glm.fit)$coef

#no data argument supplied, so predict() returns the fitted probabilities for the training set
glm.probs=predict(glm.fit,type="response")
glm.probs[1:10]
#see how the qualitative var is split
contrasts(Direction)
#convert probabilities into class labels up/ down for 0.5
glm.pred=rep("No",dim(Smarket)[1])
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(145+507)/(145+507+457+141)
mean(glm.pred==Direction)
#create a train set and a test set
train=(Year<2005)
Smarket.2005=Smarket[!train,]
dim(Smarket.2005)
Direction.2005=Direction[!train]
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial,subset=train)
glm.probs=predict(glm.fit,Smarket.2005,type="response")#Smarket.2005 test set
dim(Smarket[!train,])
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
mean(glm.pred!=Direction.2005)
#we remove vars that are not important bc their increase in variance is not met by a reduction in bias
glm.fit=glm(Direction~Lag1+Lag2,data=Smarket,subset = train,family=binomial)
glm.probs=predict(glm.fit,Smarket.2005,type="response")
glm.pred=rep("Down",252)
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction.2005)
mean(glm.pred==Direction.2005)
#predict value of Direction for values of the two vars
predict(glm.fit,newdata=data.frame(Lag1=c(1.2,1.5),Lag2=c(1.1,-0.8)),type="response")
#4.6.3 LDA
library(MASS)
lda.fit=lda(Direction~Lag1+Lag2,data=Smarket,subset=train)
lda.fit
#group means are averages of each predictor in each class
# when the market goes up, the previous days' returns (Lag1, Lag2) tend to be negative,
# and they tend to be positive when the market goes down
#the coeffs are used to create a linear combination used to form LDA
#predict() returns three elements:
# class= prediction of LDA about the movement of the market
#     posterior = the k-th column contains the posterior probability that the obs corresponds to the k-th class
# x = linear discriminants
lda.pred=predict(lda.fit,Smarket.2005)
names(lda.pred)
lda.pred$class
lda.pred$posterior
lda.pred$x

lda.class=lda.pred$class
table(lda.class,Direction.2005)
mean(lda.class==Direction.2005)
#apply a 50% threshold to the posterior probabilities and recreate the predictions in lda.pred$class
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[1:20,1]
lda.class[1:20]
#different probability threshold
sum(lda.pred$posterior[,1]>.9)
#4.6.4 QDA
library(MASS)
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit # contains the group means but not the coeffs bc QDA involves a quadratic function of the predictors
qda.class=predict(qda.fit,Smarket.2005)$class # predict() works the same way as for LDA
table(qda.class,Direction.2005)
(30+121)/(30+121+81+20)
mean(qda.class==Direction.2005)
#4.6.5 KNN
# a matrix containing the predictors associated with the training data, labeled train.x
# a matrix containing the predictors associated with the test data, labeled test.x
# a vector containing the class labels for the training observations, labeled train.Direction
# a value for K, number of nearest neighbors
library(class)
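#the notes skip the book's Smarket KNN example and go straight to Caravan; for reference, the book's fit,
#reusing the train indicator, Smarket variables and Direction.2005 created above, looks roughly like this
train.X=cbind(Lag1,Lag2)[train,]
test.X=cbind(Lag1,Lag2)[!train,]
train.Direction=Direction[train]
set.seed(1)
knn.pred=knn(train.X,test.X,train.Direction,k=1)
table(knn.pred,Direction.2005)
mean(knn.pred==Direction.2005)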
#4.6.6 Application to caravan insurance data
attach(Caravan)
standardized.x=scale(Caravan[,-86])
test=1:1000
train.x=standardized.x[-test,]
test.x=standardized.x[test,]
train.y=Purchase[-test]
test.y=Purchase[test]
set.seed(1)
knn.pred=knn(train.x,test.x,train.y,k=1)
mean(test.y!=knn.pred)
mean(test.y!="No")
table(knn.pred,test.y)
#log reg with .25 threshold
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.y)
#4.7 Exercises

#9
library(ISLR)
attach(Weekly)
#a
pairs(Weekly) #year and volume
cor(Weekly)
summary(Weekly)
cor(Weekly[,-9])
#b
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomial)
summary(glm.fit) #lag2
#c
glm.probs=predict(glm.fit,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
table(glm.pred,Direction)
(41+557)/(41+557+430+37)
mean(glm.pred==Direction)
557/(48+557) #up
54/(54+430) #down
#d
train=(Year<2009)
Weekly.test=Weekly[!train,]
glm.fit=glm(Direction~Lag2,data=Weekly,subset=train,family=binomial)
glm.probs=predict(glm.fit,Weekly.test,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
Direction.test=Direction[!train]
table(glm.pred,Direction.test)
train = (Year < 2009)
Weekly.0910 = Weekly[!train, ]
glm.fit = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train)
glm.probs = predict(glm.fit, Weekly.0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(glm.pred, Direction.0910)
(9+56)/(9+56+34+5)
mean(glm.pred==Direction.0910)
#e
library(MASS)
lda.fit=lda(Direction~Lag2,data=Weekly,subset=train)
lda.pred=predict(lda.fit,Weekly.0910)
lda.class=lda.pred$class
table(lda.class,Direction.0910)
(9+56)/(9+56+34+5)
mean(lda.class==Direction.0910)
#f
qda.fit=qda(Direction~Lag2,data=Weekly,subset = train)
qda.class=predict(qda.fit,Weekly.0910)$class
table(qda.class,Direction.0910)
mean(qda.class==Direction.0910)
#g
library(class)

train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=1)
table(knn.pred,Direction.0910)
(21+31)/(21+31+22+30)
mean(knn.pred==Direction.0910)
#h
#lda
#i

library(class)
train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=15)
mean(knn.pred==Direction.0910)
#11
#a
attach(Auto)
dim=dim(Auto)[1]
mpg01=rep(0,dim)
mpg01
mpg01[mpg>median(mpg)]=1
mpg01
Auto=data.frame(Auto,mpg01)
fix(Auto)
#b
summary(Auto)
cor(subset(Auto,select=-name)) # exclude the qualitative name column
pairs(Auto)
#c
train=(year%%2==0)
test=!train
Auto.train=Auto[train,]
Auto.test=Auto[test,]
mpg01.test=mpg01[test]
#cylinders + weight + displacement + horsepower
#d
library(MASS)
lda.fit=lda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,subset=train)
lda.pred=predict(lda.fit,Auto.test)
mean(lda.pred$class!=mpg01.test) #12% test error rate
#e
qda.fit=qda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,subset=train)
qda.class=predict(qda.fit,Auto.test)$class

mean(qda.class!=mpg01.test)#13% test error rate


#f
glm.fit=glm(mpg01~cylinders+weight+displacement+horsepower,data=Auto,family=binomial,subset=train)
glm.probs=predict(glm.fit,Auto.test,type="response")
glm.pred=rep(0,length(glm.probs))
glm.pred[glm.probs>.5]=1
mean(glm.pred!=mpg01.test)
#g
library(class)
train.x=cbind(cylinders,weight,displacement,horsepower)[train,]
test.x=cbind(cylinders,weight,displacement,horsepower)[test,]
train.mpg=mpg01[train]
knn.pred=knn(train.x,test.x,train.mpg,k=1)
mean(knn.pred!=mpg01.test)#15
knn.pred=knn(train.x,test.x,train.mpg,k=10)
mean(knn.pred!=mpg01.test)
knn.pred=knn(train.x,test.x,train.mpg,k=20)
mean(knn.pred!=mpg01.test)
knn.pred=knn(train.x,test.x,train.mpg,k=100)
mean(knn.pred!=mpg01.test)#14 best
#12
#a
power=function()
{2^3}
print(power())
#b
power2=function(x,a)
{x^a}
power2(2,3)
power2(3,8)
#c
#d
power3=function(x,a)
{
result=x^a
return(result)
}
power3(2,4)
#e
x=1:10
plot(x,power3(x,2))
#f
PlotPower = function(x, a) {
plot(x, power3(x, a))
}
PlotPower(1:10, 3)

#13
library(MASS)
attach(Boston)
summary(Boston)
#??
################################################################################
#5.3.1 Validation Set Approach
library(ISLR)
set.seed(1)
train=sample(392,196)
train
attach(Auto)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
set.seed(2)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
#5.3.2 LOOCV
#we use glm() not lm() bc. we can use cv.glm() for crossvalidation
glm.fit=glm(mpg~horsepower,data=Auto)
coef(glm.fit)
lm.fit=lm(mpg~horsepower,data=Auto)
coef(lm.fit)
#they are the same
library(boot)
glm.fit=glm(mpg~horsepower,data=Auto)
cv.err=cv.glm(Auto,glm.fit)
cv.err$delta
#delta contains the cv results
#for loop for polynomial fits
cv.error=rep(0,5)
for(i in 1:5)
{glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
cv.error[i]=cv.glm(Auto,glm.fit)$delta[1]
}
cv.error

#5.3.3 K-fold Cross Validations


set.seed(17)
cv.error.10=rep(0,10)
for(i in 1:10)
{
glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1]
}
cv.error.10
#5.3.4 The Bootstrap
alpha.fn=function(data,index)
{
  X=data$X[index]
  Y=data$Y[index]
  # alpha-hat = (var(Y)-cov(X,Y)) / (var(X)+var(Y)-2*cov(X,Y))
  return( (var(Y)-cov(X,Y)) / (var(X)+var(Y)-2*cov(X,Y)) )
}
alpha.fn(Portfolio,1:100)
#this is automated using the boot() function
boot(Portfolio,alpha.fn,R=1000)
#create a function that takes a set and indices and returns slope and intercept
boot.fn=function(data,index)
{
return(coef(lm(mpg~horsepower,data=data,subset=index)))
}
boot.fn(Auto,1:392)
boot.fn(Auto,sample(392,392,replace=T))
boot(Auto,boot.fn,1000)
summary(lm(mpg~horsepower,data=Auto))$coef
boot.fn=function(data,index)
{ coefficients(lm(mpg~horsepower+I(horsepower^2),data=data,subset=index)) }
set.seed (1)
boot(Auto ,boot.fn ,1000)
##5.4 execises
#5
#a
library(ISLR)
attach(Default)
glm.fit=glm(default~income+balance,data=Default,family = binomial)
#b
FiveB=function()
{
  train=sample(dim(Default)[1],dim(Default)[1]/2)
  glm.fit=glm(default~income+balance,data=Default,family=binomial,subset=train)
  glm.pred=rep("No",dim(Default)[1]/2)
  glm.probs=predict(glm.fit,Default[-train,],type="response")
  glm.pred[glm.probs>0.5]="Yes"
  return( mean(glm.pred!=Default[-train,]$default) )
}
FiveB()
#c
#d
FiveB=function()
{
  train=sample(dim(Default)[1],dim(Default)[1]/2)
  glm.fit=glm(default~income+balance+student,data=Default,family=binomial,subset=train)
  glm.pred=rep("No",dim(Default)[1]/2)
  glm.probs=predict(glm.fit,Default[-train,],type="response")
  glm.pred[glm.probs>0.5]="Yes"
  return( mean(glm.pred!=Default[-train,]$default) )
}
FiveB()
#6
#a
library(ISLR)
attach(Default)
set.seed(1)
glm.def=glm(default~income+balance,data=Default,family = binomial)
summary(glm.def)
#b
boot.fn = function(data, index) return(coef(glm(default ~ income + balance, data = data, family = binomial, subset = index)))

#c
library(boot)
boot(Default,boot.fn,50)
#d
#same
#8
#a
set.seed (1)
y=rnorm (100)
x=rnorm (100)
y=x-2* x^2+ rnorm (100)
#b
plot(x,y)
#c
Data=data.frame(x,y)
glm.fit=glm(y~x,data=Data)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2),data=Data)
cv.glm(Data,glm.fit)$delta
#d
set.seed(2)
Data=data.frame(x,y)
glm.fit=glm(y~x,data=Data)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2),data=Data)
cv.glm(Data,glm.fit)$delta
#9
#a
attach(Boston)
################################################################################
#6.5 Lab 1: Subset Selection Methods
library(ISLR)
attach(Hitters)
sum(is.na(Hitters))
Hitters=na.omit(Hitters)
sum(is.na(Hitters))
#regsubsets() used for best subset selection using RSS [same syntax as lm]
library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
#the function reports the best model up to eight variables, but the number of variables can be changed using nvmax=...
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax = 19)
reg.summary=summary(regfit.full)
reg.summary
#we can use R sq, RSS, Adj R sq, Cp and BIC to assess the model
names(reg.summary)
reg.summary$rsq
#plot R sq., Adj. R sq, Cp and BIC at once to assess the model
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="no of var",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max (reg.summary$adjr2)
points(11,reg.summary$adjr2[11],col="red",cex=2,pch=20) # mark the best point on the plot
plot(reg.summary$cp,xlab="no of var",ylab="CP",type="l")
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="blue",cex=3,pch=21)
plot(reg.summary$bic,xlab="no of var",ylab="BIC",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],col="blue",cex=3,pch=21)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp")
plot(regfit.full,scale="bic")
coef(regfit.full,6)
coef(regfit.full,8)
#6.5.2 Forward and Backward Stepwise Selection
#we see that the models up to six variables are identical for best subset, forward, and backward selection
#forward selection
regfit.fwd=regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
#backward selection
regfit.bwd=regsubsets(Salary~.,data=Hitters,nvmax = 19,method="backward")
summary(regfit.bwd)
coef(regfit.full,7)
coef(regfit.fwd,7)
coef(regfit.bwd,7)
#6.5.3 Choosing Among Models Using the Validation Set Approach and Cross-Validation
set.seed(1)
train=sample(c(TRUE,FALSE),nrow(Hitters),rep=T)
train
test=!train
#now apply regsubsets() to perform best subset selection
regfit.best=regsubsets(Salary~.,data=Hitters[train,],nvmax=19)
#we now compute MSE for the best model of each size
# to do this we run a loop, and for each size i we extract the coeffs from regfit.best for the model of that size,
# multiply them into the appropriate columns of the test model matrix [model.matrix()], THEN compute the MSE
test.mat=model.matrix(Salary~.,data=Hitters[test,])
val.errors=rep(NA,19)
for (i in 1:19)
{
coefi=coef(regfit.best,id=i)
pred=test.mat[,names(coefi)]%*%coefi
val.errors[i]=mean( (Hitters$Salary[test]-pred)^2 )
}
val.errors
which.min(val.errors)
coef(regfit.full,10)
####################
coefi=coef(regfit.best,id=2)
coefi
names(coefi)
pred=test.mat[,names(coefi)]%*%coefi
a<-test.mat[,names(coefi)]
fix(a)
fix(pred)
#####################

#finally we perform best subset selection on the full data set, and select the best 10-variable model!
reg.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg.full,10)
##CV
#we create a vector that allocates each observation to one of k=10 folds and create a matrix that stores the results
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=T)
folds
cv.errors=matrix(NA,k,19,dimnames = list(NULL,paste(1:19)))
cv.errors
#
cv.errors=matrix(NA,k,19)
cv.errors
#
#we write a loop that performs cross validation.
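#note (added for completeness): the loop below calls predict() on a regsubsets object, which has no built-in
#predict method; the helper below (the same one these notes define again in the chapter 6 exercises) must exist first
predict.regsubsets=function(object,newdata,id,...){
  form=as.formula(object$call[[2]])
  mat=model.matrix(form,newdata)
  coefi=coef(object,id=id)
  mat[,names(coefi)]%*%coefi
}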
for(j in 1:k)
{
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19)
{
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean((Hitters$Salary[folds==j]-pred )^2)
}
}
cv.errors
#CV selects an 11-variable model, so we refit best subset selection on the full data set
best.fit=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(best.fit,11)
##6.6 Lab 2: Ridge Regression and the Lasso
#we use the glmnet() func to fit ridge and lasso. the function needs x as a matrix and y as a vector
#we remove missing values
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary

#model.matrix() produces a matrix corresponding to the 19 predictors and also turns all qualitative vars into dummy variables,
# bc glmnet() takes only quantitative inputs
#6.6.1 RIDGE
#glmnet() has an alpha argument: alpha=0 performs ridge regression and alpha=1 performs the lasso
library(glmnet)
grid=10^seq(10,-2,length=100)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
#the func performs ridge reg for an automatically selected range of lambda. here we have chosen to implement the
#function over a grid of values ranging from the null model (only the intercept) to the least squares fit
# !! [ a very high value of lambda = null model / lambda=0 is least squares (+ arg. exact=T) ]
#with each value of lambda there is a vector of ridge regr coeffs, stored in a matrix.
#here we have a 20x100 matrix, with 20 rows (one for each predictor + intercept) and 100 columns, one for each value of lambda
dim(coef(ridge.mod))
#the coeff estimates are much smaller in terms of l2 norm when a larger value of lambda is used,
#and bigger when a small value of lambda is used
ridge.mod$lambda[50]
coef(ridge.mod)[,50]
ridge.mod$lambda[60]
coef(ridge.mod)[,60]
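#a quick check of the l2 norm of the coefficients (intercept dropped) at the two lambda values above:
#the norm is far smaller for the larger lambda
sqrt(sum(coef(ridge.mod)[-1,50]^2))
sqrt(sum(coef(ridge.mod)[-1,60]^2))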
#we can use the predict function to obtain ridge reg coeffs for a new value of lambda, say 50
predict(ridge.mod,s=50,type="coefficients")[1:20,]
# we split the data into a training set and a test set. there are two methods to do this:
# 1. produce a random vector of TRUE and FALSE, and select the elements corresponding to TRUE for the training data
# 2. randomly choose a subset of numbers between 1 and n and use them as indices for the training observations
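#a minimal sketch of method 1 for comparison (illustrative only; the lab continues with method 2 below)
train.bool=sample(c(TRUE,FALSE),nrow(x),rep=TRUE)
test.bool=!train.bool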
set.seed(1)
train=sample(1:nrow(x),nrow(x)/2)
train
test=-train
y.test=y[test]
#we fit a ridge regression on the training set and evaluate it on the test set, using lambda=4
# in the predict() funct we replace type="coefficients" with the newx argument
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid,thresh = 1e-12)
ridge.pred=predict(ridge.mod,s=4,newx = x[test,])
mean( (ridge.pred-y.test)^2 )
#fit a model with just the intercept (very large value of lambda 1e10 = 10^10)
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean ( (ridge.pred-y.test)^2 )
# lambda = 4 leads to much lower MSE than just an intercept
# we now check if there is any benefit to using lambda = 4 instead of a least squares regression

ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean ( (ridge.pred-y.test)^2 )
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
#instead of choosing lambda by hand we can use cross-validation with cv.glmnet(), which performs ten-fold
#cross-validation by default [ the no. of folds can be changed using the nfolds argument ]
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
#212 is best lambda value
#MSE for this value of lambda is
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean( (ridge.pred-y.test)^2 )
#fit model on full data set and get coeffs
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## 6.6.2 The Lasso
library(glmnet)
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda = grid)
plot(lasso.mod)
#Cv lasso
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean(
(lasso.pred-y.test)^2
)
#apply on full set
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
#6.7 Lab 3: PCR and PLS Regression
#6.7.1 Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,validation="CV") # scale=T standardizes the predictors, validation="CV" does ten-fold CV
summary(pcr.fit) # 38.31% of the variance explained by the first component ....
## !!!!! pcr() reports the root mean squared error, so we have to square this quantity to obtain the usual MSE
validationplot(pcr.fit,val.type="MSEP") #CV scores
#perform PCR on the training data and evaluate its test performance
set.seed(1)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,subset=train,validation="CV")
validationplot(pcr.fit,val.type = "MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp = 7)
mean( (pcr.pred-y.test)^2 )

#pcr on the full data set


pcr.fit=pcr(y~x,scale=T,ncomp=7)
summary(pcr.fit)
#6.7.2 Partial Least Squares
set.seed(1)
pls.fit=plsr(Salary~.,data=Hitters,subset=train,scale=T,validation="CV")
summary(pls.fit)
#lowest MSE when M=2
pls.pred=predict(pls.fit,x[test,],ncomp=2)
mean( (pls.pred-y.test)^2 )
#PLS on full data set with M=2
pls.fit=plsr(Salary~.,data=Hitters,scale=T,ncomp=2)
summary(pls.fit)
###### EXERCISES ###############
##8
#a
set.seed(1)
x=rnorm(100)
eps=rnorm(100)
x
eps
#b beta0=3, beta1=2, beta2=-3 and beta3=0.3
beta0=3
beta1=2
beta2=-3
beta3=0.3
y=beta0+beta1*x+beta2*x^2+beta3*x^3+eps
#c
data=data.frame(x,y)
fix(data)
library(leaps)
regfit.full=regsubsets(y~poly(x,10,raw=T),data=data,nvmax = 10)
reg.summary=summary(regfit.full)
#adj r sq
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(reg.summary$adjr2)
points(3,reg.summary$adjr2[3],cex=4,pch=33)
#CP
plot(reg.summary$cp,xlab="no of var",ylab="cp",type="l")
which.min(reg.summary$cp)
points(3,reg.summary$cp[3],cex=4,pch=33)
#BIC
plot(reg.summary$bic,xlab="no of var",ylab="BIC",type="l")
which.min(reg.summary$bic)
points(3,reg.summary$bic[3],cex=4,pch=33)
coefficients(regfit.full,id=3)
#d
regfit.fwd=regsubsets(y~poly(x,10,raw=T),data=data,nvmax=10,method="forward")
regfit.sum=summary(regfit.fwd)
#adj r sq
plot(regfit.sum$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(regfit.sum$adjr2)

points(3,regfit.sum$adjr2[3],cex=4,pch=33)
#CP
plot(regfit.sum$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum$cp)
points(3,regfit.sum$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum$bic)
points(3,regfit.sum$bic[3],cex=4,pch=33)
coefficients(regfit.fwd,id=3)
regfit.bwd=regsubsets(y~poly(x,10,raw=T),data=data,nvmax=10,method="backward")
regfit.sum1=summary(regfit.bwd)
#adj r sq
plot(regfit.sum1$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(regfit.sum1$adjr2)
points(4,regfit.sum1$adjr2[4],cex=4,pch=33)
#CP
plot(regfit.sum1$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum1$cp)
points(3,regfit.sum1$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum1$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum1$bic)
points(3,regfit.sum1$bic[3],cex=4,pch=33)
coefficients(regfit.bwd,id=3)
coefficients(regfit.bwd,id=4)
#e
library(glmnet)
xmat=model.matrix(y~poly(x,10,raw=T),data=data)[,-1]
fix(data)
cv.out=cv.glmnet(xmat,y,alpha=1)
best.lam=cv.out$lambda.min
plot(cv.out)
#fit model on the full data using best lambda found
cv.out=cv.glmnet(xmat,y,alpha=1)
predict(cv.out,s=best.lam,type="coefficients")
#f
beta7 = 7
y = beta0 + beta7 * x^7 + eps
data=data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10,raw=T),data=data,nvmax = 10)
reg.summary=summary(regfit.full)
which.max(reg.summary$adjr2)
which.min(reg.summary$cp)
which.min(reg.summary$bic)
coefficients(regfit.full,id=4)
coefficients(regfit.full,id=2)
coefficients(regfit.full,id=1)

xmat=model.matrix(y~poly(x,10,raw=T),data=data)[,-1]
mod.lasso=cv.glmnet(xmat,y,alpha=1)
best.lam=mod.lasso$lambda.min
best.lam
best.model=cv.glmnet(xmat,y,alpha=1)
predict(best.model,s=best.lam,type="coefficients")
##9
#a
library(ISLR)
rm(College)
fix(College)
sum(is.na(College))
set.seed(11)
train.size=dim(College)[1]/2
train=sample(1:dim(College)[1],train.size)
dim(College)
train.size
dim(College)[1]
college.train=College[train,]
college.test=College[-train,]
#b
lm.fit=lm(Apps~.,data=college.train)
lm.pred=predict(lm.fit,college.test)
mean( (college.test[,"Apps"]-lm.pred)^2 )

#c
library(glmnet)
train.mat=model.matrix(Apps~.,data=college.train)
test.mat=model.matrix(Apps~.,data=college.test)
grid=10^seq(4,-2,length=100)
ridge.mod=cv.glmnet(train.mat,college.train[,"Apps"],alpha=0,lambda=grid,thresh=1e-12)
lambda.best=ridge.mod$lambda.min
lambda.best
ridge.pred=predict(ridge.mod,s=lambda.best,newx=test.mat)
mean( (college.test[,"Apps"]-ridge.pred)^2 )

#d
mod.lasso=cv.glmnet(train.mat,college.train[,"Apps"],alpha=1,lambda=grid,thresh=1e-12)
lambda.best=mod.lasso$lambda.min
lambda.best
lasso.pred=predict(mod.lasso,newx=test.mat,s=lambda.best)
mean( (college.test[,"Apps"]-lasso.pred)^2 )
#coefs
mod.lasso=glmnet(model.matrix(Apps~.,data=College),College[,"Apps"],alpha=1)
predict(mod.lasso,s=lambda.best,type="coefficients")
#e

library(pls)
pcr.fit=pcr(Apps~.,data=college.train,scale=T,validation="CV")
validationplot(pcr.fit,val.type="MSEP")
pcr.pred=predict(pcr.fit,college.test,ncomp=10)
mean( (college.test[,"Apps"]-data.frame(pcr.pred))^2 )
#f
pls.fit=plsr(Apps~.,data=college.train,scale=T,validation="CV")
validationplot(pls.fit,val.type="MSEP")
pls.pred=predict(pls.fit,college.test,ncomp=10)
mean( (college.test[,"Apps"]-data.frame(pls.pred))^2 )
#g
test.avg = mean(college.test[, "Apps"])
lm.test.r2 = 1 - mean((college.test[, "Apps"] - lm.pred)^2) / mean((college.test[, "Apps"] - test.avg)^2)
ridge.test.r2 = 1 - mean((college.test[, "Apps"] - ridge.pred)^2) / mean((college.test[, "Apps"] - test.avg)^2)
lasso.test.r2 = 1 - mean((college.test[, "Apps"] - lasso.pred)^2) / mean((college.test[, "Apps"] - test.avg)^2)
pcr.test.r2 = 1 - mean((college.test[, "Apps"] - data.frame(pcr.pred))^2) / mean((college.test[, "Apps"] - test.avg)^2)
pls.test.r2 = 1 - mean((college.test[, "Apps"] - data.frame(pls.pred))^2) / mean((college.test[, "Apps"] - test.avg)^2)
barplot(c(lm.test.r2, ridge.test.r2, lasso.test.r2, pcr.test.r2, pls.test.r2), col="red", names.arg=c("OLS", "Ridge", "Lasso", "PCR", "PLS"), main="Test R-squared")
##10
#a
set.seed(1)
n=1000
p=20
x=matrix(rnorm(n*p),n,p)
x
b=rnorm(p)
b[3]=0
b[4]=0
b[9]=0
b[19]=0
b[10]=0
eps=rnorm(p)
y=x%*%b+eps # matrix product: y_i = sum_j x_ij * b_j + eps
plot(x)
#b
set.seed(1)
train=sample(seq(1000),100,replace=F)
seq(1000)
y.train=y[train]
y.test=y[-train]
x.train=x[train,]
x.test=x[-train,]
#c
#Perform best subset selection on the training set, and plot the
#training set MSE associated with the best model of each size.

library(leaps)
regfit.full = regsubsets(y ~ ., data = data.frame(x = x.train, y = y.train), nvmax = p)
val.errors = rep(NA, p)
x_cols = colnames(x, do.NULL = FALSE, prefix = "x.")
x_cols
for (i in 1:p) {
coefi = coef(regfit.full, id = i)
  pred = as.matrix(x.train[, x_cols %in% names(coefi)]) %*% coefi[names(coefi) %in% x_cols]
  pred
  as.matrix(x.train[, x_cols %in% names(coefi)]) * coefi[names(coefi) %in% x_cols]
  val.errors[i] = mean((y.train - pred)^2)
}
plot(val.errors, ylab = "Training MSE", pch = 19, type = "b")
#d
val.errors = rep(NA, p)
for (i in 1:p) {
coefi = coef(regfit.full, id = i)
  pred = as.matrix(x.test[, x_cols %in% names(coefi)]) %*% coefi[names(coefi) %in% x_cols]
val.errors[i] = mean((y.test - pred)^2)
}
plot(val.errors, ylab = "Test MSE", pch = 19, type = "b")
#e
which.min(val.errors)
#f
coef(regfit.full,id=16)
#g
##11
#a
#best subset
set.seed(1)
library(MASS)
library(leaps)
library(glmnet)
rm(Boston)
fix(Boston)
predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefi = coef(object, id = id)
  mat[, names(coefi)] %*% coefi
}
k = 10
p = ncol(Boston) - 1
folds = sample(rep(1:k, length = nrow(Boston)))
cv.errors = matrix(NA, k, p)
for (i in 1:k) {
best.fit = regsubsets(crim ~ ., data = Boston[folds != i, ], nvmax = p)
for (j in 1:p) {
pred = predict(best.fit, Boston[folds == i, ], id = j)
cv.errors[i, j] = mean((Boston$crim[folds == i] - pred)^2)
}
}
rmse.cv = sqrt(apply(cv.errors, 2, mean))
plot(rmse.cv, pch = 19, type = "b")
#b
#lasso
attach(Boston)
xmat=model.matrix(crim~.-1,data=Boston)
cv.out=cv.glmnet(xmat,Boston$crim,type.measure = "mse")
plot(cv.out)
coef(cv.out)
sqrt(cv.out$cvm[cv.out$lambda==cv.out$lambda.1se] )
#ridge
cv.ridge=cv.glmnet(xmat,Boston$crim,type.measure = "mse",alpha=0)
plot(cv.ridge)
coef(cv.ridge)
sqrt( cv.ridge$cvm[cv.ridge$lambda==cv.ridge$lambda.1se] )
#pcr
library(pls)
pcr.fit=pcr(crim~.,data=Boston,scale=T,validation="CV")
summary(pcr.fit)
################################################################################

#POLYNOMIAL REGRESSION: extends the linear model by adding extra predictors, obtained by raising each of the original
# predictors to a power. For ex. a cubic regression uses three variables, x, x^2, x^3, as predictors.
#STEP FUNCTIONS: cut the range of a variable into K distinct regions in order to produce a qualitative variable.
#REGRESSION SPLINES: an extension of polynomial and step functions. they involve dividing the range of X
# into K distinct regions. within each region, a polynomial function is fit, but the polynomials are constrained so that they
# join smoothly at the region boundaries (knots). provided the range is divided into enough regions, they can produce
# an extremely flexible fit.

#SMOOTHING SPLINES: similar to regression splines, but they result from minimizing an RSS criterion subject to a smoothness penalty
#LOCAL REGRESSION: similar to splines but the regions are allowed to overlap
#GAMs: allow the extension of the above methods in order to deal with multiple predictors
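#for reference (added note), the criterion a smoothing spline g() minimizes, where lambda controls the roughness penalty:
#   sum_i ( y_i - g(x_i) )^2  +  lambda * integral of g''(t)^2 dt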
################################################################################
# 7.8 Lab: Non-linear Modeling
library(ISLR)
attach(Wage)
#7.8.1 Polynomial Regression and Step Functions
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
fit2=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
#create a grid of values for age at which we want predictions
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
preds=predict(fit,newdata = list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands
plot(age,wage,col="red")
title("Degree 4 polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,col="blue")
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
fit.1= lm(wage~education +age ,data=Wage)
fit.2= lm(wage~education +poly(age ,2) ,data=Wage)
fit.3= lm(wage~education +poly(age ,3) ,data=Wage)
anova(fit.1, fit.2, fit.3)
#STEP Functions
# Next we consider the task of predicting whether an individual earns more
# than $250,000 per year.
fit=glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
preds=predict(fit,newdata = list(age=age.grid),se=T)
pfit=exp(preds$fit)/(1+exp(preds$fit))

se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
plot(age,I(wage>250),xlim=agelims,ylim=c(0,.2),type="n")
points(jitter(age),I((wage>250)/5),pch="|",col="darkgrey")
lines(age.grid,pfit,col="blue")
matlines(age.grid,se.bands,col="blue")
# We have drawn the age values corresponding to the observations with wage
# values above 250 as gray marks on the top of the plot, and those with wage
# values below 250 are shown as gray marks on the bottom of the plot.
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
#The age<33.5 category is left out, so the intercept coefficient of
#$94,160 can be interpreted as the average salary for those under 33.5 years
#of age, and the other coefficients can be interpreted as the average additional
#salary for those in the other age groups.

#7.8.2 Splines
#fit wage to age using a regression spline. by default, cubic regression splines are used
attach(Wage)
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age.grid),se=T)
plot(age,wage,col="red")
lines(age.grid,pred$fit,lwd=4)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
#here we have placed knots at 25, 40, 60, which produces a spline with six basis functions.
# a cubic spline with three knots has seven degrees of freedom - one for the intercept plus six basis functions
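#a quick illustrative check (not in the original notes): the basis matrix for knots at 25, 40, 60 indeed has six columns
dim(bs(age,knots=c(25,40,60)))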
attr(bs(age,df=6),"knots")
#with df=6, R chooses the knots itself (at uniform quantiles of age)
#bs() also has a degree argument, so we can fit splines of any degree rather than the default cubic
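#for example, a quadratic (degree-2) spline with the same df - an illustrative call, not part of the original notes
fit.quad=lm(wage~bs(age,df=6,degree=2),data=Wage)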
#NATURAL SPLINES
#ns with 4 df
fit2=lm(wage~ns(age,df=4),data=Wage)
pred2=predict(fit2,newdata=list(age=age.grid),se=T)
lines(age.grid,pred2$fit,col="blue",lwd=4)
#as with bs(), we could have specified the knots directly using the knots argument
#SMOOTHING SPLINE
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit1=smooth.spline(age,wage,cv=T)
fit1$df
lines(fit,col="red")

lines(fit1,col="blue")
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex
=.8)
#when we specified df=16 the function calculates the value of lambda needed for 16 df
#when we let cross-validation choose the smoothness instead, it yields a df of 6.8
#LOCAL REGRESSION
plot(age,wage,xlim=agelims,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit1=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,newdata=data.frame(age=age.grid)))
lines(age.grid,predict(fit1,newdata=data.frame(age=age.grid)))
#local regr with spans .2 and .5: each neighborhood consists of 20% or 50% of the observations.
# the larger the span, the smoother the fit
#GAMS
#we fit a GAM to predict wage using natural spline functions of year and age, treating education as a qualitative predictor
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
#we now fit a model that uses smoothing splines, rather than natural splines .
#we need to use the gam() function
library(gam)
# s() is used for a smoothing spline
gam.m3=gam(wage~s(year,4)+s(age,4)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3,se=T)
plot(gam1)
plot.gam(gam1,se=T)
#m1 gam that excludes year
#m2 gam that uses linear funct of year
#m3 gam that uses a spline function
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
summary(gam.m3)
#the p-values of the model reinforce the idea that a linear function is needed for year and a non-linear one for age
#predictions on the training set
preds=predict(gam.m2,newdata=Wage)
# we can also use local regression as the building blocks of a GAM with the lo() function
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.gam(gam.lo,se=T)
gam.lo.i=gam(wage~lo(year,age,span=0.5),data=Wage)
library(akima)
plot(gam.lo.i)

par(mfrow=c(1,1))
#gams with log reg
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow =c(1,3))
plot(gam.lr,se=T)
table(education,I(wage>250))
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,subset=(education!="1. < HS Grad"))
plot(gam.lr,se=T)
################################################################################
#7.9 Exercises
#6
#a
set.seed(1)
library(boot)
all.deltas=rep(NA,10)
for(i in 1:10)
{
glm.fit=glm(wage~poly(age,i),data=Wage)
all.deltas[i]=cv.glm(Wage,glm.fit,K=10)$delta[2]
}
all.deltas
plot(1:10,all.deltas,xlab="degree",ylab="CV error",type="b",ylim=c(1590,1700))
min.point=min(all.deltas)
sd.point=sd(all.deltas)
abline(h=min.point+0.2*sd.point,lty="dashed")
abline(h=min.point-0.2*sd.point,lty="dashed")
legend("topright","0.2 sd line ",lty="dashed")
# degree 3 is sufficient according to the 0.2-sd lines
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
lm.fit=lm(wage~poly(age,3),data=Wage)
preds=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,preds,col="blue",lwd=3)

fit.1=lm(wage~poly(age,1),data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
fit.7=lm(wage~poly(age,7),data=Wage)
fit.8=lm(wage~poly(age,8),data=Wage)
fit.9=lm(wage~poly(age,9),data=Wage)
fit.10=lm(wage~poly(age,10),data=Wage)

anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9,fit.10)
#b
all.cvs=rep(NA,10)
for(i in 2:10)
{
Wage$age.cut=cut(Wage$age,i)
lm.fit=glm(wage~age.cut,data=Wage)
all.cvs[i]=cv.glm(Wage,lm.fit,K=10)$delta[2]
}
all.cvs
plot(2:10,all.cvs[-1],xlab="no of cuts",ylab="cv err",type="b")
#8 cuts
lm.fit=glm(wage~cut(age,8),data=Wage)
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
lm.pred=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,lm.pred,col="red",lwd=4)
##7
#a
set.seed(1)
summary(Wage$maritl)
plot(Wage$maritl)
summary(Wage$jobclass)
plot(Wage$jobclass)
par(mfrow=c(1,2))
plot(Wage$maritl,Wage$wage)
plot(Wage$jobclass,Wage$wage)
fit=lm(wage~maritl,data=Wage)
deviance(fit)
fit=lm(wage~jobclass,data=Wage)
deviance(fit)
fit=lm(wage~maritl+jobclass,data=Wage)
deviance(fit)
#gam
fit=gam(wage~maritl+jobclass+s(age,4),data=Wage)
deviance(fit)
##8
pairs(Auto)
#mpg is inversely related to cylinders, displacement, horsepower and weight
cv.errs=rep(NA,10)
for (i in 1:10)
{
fit=glm(mpg~poly(displacement,i),data=Auto)

cv.errs[i]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)
#10 th degree polynomial
attach(Auto)
plot(displacement,mpg)
lm.fit.poly=glm(mpg~poly(displacement,10),data=Auto)
summary(displacement)
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
preds=predict(lm.fit.poly,data.frame(displacement=disp.grid))
lines(disp.grid,preds,col="red",lwd=5)
pol1=lm(mpg~poly(displacement,1),data=Auto)
pol2=lm(mpg~poly(displacement,2),data=Auto)
pol3=lm(mpg~poly(displacement,3),data=Auto)
pol4=lm(mpg~poly(displacement,4),data=Auto)
pol5=lm(mpg~poly(displacement,5),data=Auto)
pol6=lm(mpg~poly(displacement,6),data=Auto)
pol7=lm(mpg~poly(displacement,7),data=Auto)
pol8=lm(mpg~poly(displacement,8),data=Auto)
pol9=lm(mpg~poly(displacement,9),data=Auto)
pol10=lm(mpg~poly(displacement,10),data=Auto)
anova(pol1,pol2,pol3,pol4,pol5,pol6,pol7,pol8,pol9,pol10)
#step f
all.cvs=rep(NA,10)
for(i in 2:10)
{
Auto$dis.cut=cut(Auto$displacement,i)
lm.fit=glm(mpg~dis.cut,data=Auto)
all.cvs[i]=cv.glm(Auto,lm.fit,K=10)$delta[2]
}
all.cvs
plot(2:10,all.cvs[-1],xlab="no of cuts",ylab="CV error",type="b")
which.min(all.cvs)
#9 cuts
lm.fit=glm(mpg~cut(displacement,9),data=Auto)
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
preds=predict(lm.fit,data.frame(displacement=disp.grid))
plot(displacement,mpg)
lines(disp.grid,preds,col="red",lwd=5)
#splines
library(splines)
cv.errs=rep(NA,10)
for(df in 3:10)
{
fit=glm(mpg~ns(displacement,df=df),data=Auto)
cv.errs[df]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)

plot(displacement,mpg)
fit2=lm(mpg~ns(displacement,df=10),data=Auto)
pred2=predict(fit2,newdata=list(displacement=disp.grid),se=T)
lines(disp.grid,pred2$fit,col="blue",lwd=4)
#gams
fit = gam(mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto)
summary(fit)
##9
#a
poly.fit=lm(nox~poly(dis,3),data=Boston)
attach(Boston)
dis.range=range(dis)
dis.range
dis.grid=seq(from=dis.range[1],to=dis.range[2])
preds=predict(poly.fit,data.frame(dis=dis.grid))
plot(dis,nox)
lines(dis.grid,preds,col="red",lwd=5)
title("Pen-Pineapple-Apple-Pen")
#b
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=lm(nox~poly(dis,i),data=Boston)
all.rss[i]=sum(poly.fit$residuals^2)
}
all.rss
plot(1:10,all.rss,type="b")
#c
library(boot)
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=glm(nox~poly(dis,i),data=Boston)
all.rss[i]=cv.glm(Boston,poly.fit,K=10)$delta[2]
}
all.rss
plot(1:10,all.rss,xlab="no of deg",ylab="cv err",type="b")
which.min(all.rss)
#degree 4 minimizes the CV error
#d
library(splines)
sp.fit=lm(nox~bs(dis,df=4,knots=c(4,7,11)),data=Boston)
summary(sp.fit)
sp.pred=predict(sp.fit,list(dis=dis.grid))
plot(nox~dis,data=Boston)
plot(dis,nox)
lines(dis.grid,sp.pred,col="blue",lwd=3)

#e
all.cvs=rep(NA,16)
for(i in 3:16)
{
lm.fit=lm(nox~bs(dis,df=i),data=Boston)
all.cvs[i]=sum(lm.fit$residuals^2)
}
all.cvs
which.min(all.cvs)
#f
all.cv = rep(NA, 16)
for (i in 3:16) {
lm.fit = glm(nox ~ bs(dis, df = i), data = Boston)
all.cv[i] = cv.glm(Boston, lm.fit, K = 10)$delta[2]
}
all.cv
plot(3:16,all.cv[-c(1,2)],xlab="no of var",ylab="cv err",type="b")
##10
#a
set.seed(1)
library(ISLR)
attach(College)
fix(College)
train=sample(length(Outstate),length(Outstate)/2)
test=-train
college.train=College[train,]
college.test=College[test,]
#fwd stepwise selection on training set
dim(College)
library(leaps)
regfit.fwd=regsubsets(Outstate~.,data=college.train,nvmax=17,method="forward")
reg.summary=summary(regfit.fwd)
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l",ylim=c(0.4,0.84))
which.max(reg.summary$adjr2)
points(13,reg.summary$adjr2[13],cex=2.5,pch=15)
max.adjr2=max(reg.summary$adjr2)
sd.adjr2=sd(reg.summary$adjr2)
abline(h=max.adjr2+0.2*sd.adjr2,lty="dashed")
abline(h=max.adjr2-0.2*sd.adjr2,lty="dashed")
max.adjr2-0.2*sd.adjr2
max.adjr2+0.2*sd.adjr2
plot(reg.summary$bic,xlab="no of var",ylab="bic",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],cex=2.5,pch=15)
sd.bic=sd(reg.summary$bic)
min.bic=min(reg.summary$bic)
abline(h=min.bic+0.2*sd.bic,lty="dashed")
abline(h=min.bic-0.2*sd.bic,lty="dashed")
min.bic+0.2*sd.bic
min.bic-0.2*sd.bic

plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")


which.min(reg.summary$cp)
min.cp = min(reg.summary$cp)
std.cp = sd(reg.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "red", lty = 2)
abline(h = min.cp - 0.2 * std.cp, col = "red", lty = 2)
#6 vars so id = 6
reg.fit=regsubsets(Outstate~.,data=college.train,method="forward")
coefi=coef(reg.fit,id=6)
names(coefi)
#b
library(gam)
gam.fit=gam(Outstate~Private+s(Room.Board,df=2)+s(PhD,df=2)+s(perc.alumni,df=2)+s(Expend,df=2)+s(Grad.Rate,df=2),data=college.train)
par(mfrow = c(2, 3))
plot(gam.fit, se = T, col = "blue")
#c
gam.pred=predict(gam.fit,college.test)
gam.err=mean( (college.test$Outstate-gam.pred)^2 )
gam.err
gam.tss = mean((college.test$Outstate - mean(college.test$Outstate))^2)
test.rss = 1 - gam.err/gam.tss
test.rss
#d
summary(gam.fit)
################################################################################
