# Documenti di Didattica
# Documenti di Professioni
# Documenti di Cultura
0########
#3.6.2 Simple Linear Regression
fix(Boston)
names(Boston)
attach(Boston)
#medv response, lstat predictor
lm.fit=lm(medv~lstat)
#basic information
lm.fit
#detailed information
summary(lm.fit)
#pieces of information stored in lm.fit
names(lm.fit)
terms(lm.fit)
residuals(lm.fit)
plot(residuals(lm.fit))
#confidence intervals
confint(lm.fit)
#predict function used to compute confidence/prediction intervals for a given
#value of the predictor
# Confidence and prediction intervals for medv at lstat = 5, 10 and 15.
# Both are centered on the same value but the latter is wider.
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="confidence")
predict(lm.fit,data.frame(lstat=c(5,10,15)),interval="prediction")
#plot the response (medv), predictor(lstat), and regression line
plot(lstat,medv)
abline(lm.fit)
#diagnostic plots
par(mfrow=c(2,2))
plot(lm.fit)
#residuals() returns the residuals while rstudent() returns the studentized
#residuals, which we plot against the fitted values [predict(lm.fit)]
plot(predict(lm.fit),residuals(lm.fit))
plot(predict(lm.fit),rstudent(lm.fit))
#leverage statistics
plot(hatvalues(lm.fit))
which.max(hatvalues(lm.fit))
#3.6.3 Multiple Linear Regression
lm.fit=lm(medv~lstat+age,data=Boston)
summary(lm.fit)
lm.fit=lm(medv~.,data=Boston)
summary(lm.fit)
library(car)
#variance inflation factor
vif(lm.fit)
#all predictors except age
lm.fit=lm(medv~.-age,data=Boston)
summary(lm.fit)
#3.6.4 Interaction Terms
#lstat*age=lstat+age+lstat:age <- interaction term
summary(lm(medv~lstat*age,data=Boston))
#3.6.5 Non-Linear Transformation of the Predictors
lm.fit2=lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm.fit2)
#comparing models using anova
anova(lm.fit,lm.fit2)
lm.fit5=lm(medv~poly(lstat,5),data=Boston)
#log transformation of the predictors [ useful for heteroscedascity and other pr
oblems]
summary(lm(medv~log(rm),data=Boston))
#3.6.6 Qualitative Predictors
fix(Carseats)
names(Carseats)
#lm with interaction terms
lm.fit=lm(Sales~.+Income:Advertising+Price:Age,data=Carseats) #all plus interact
ion terms added
summary(lm.fit)
#contrasts() returns the coding done for dummy vars.
attach(Carseats)
contrasts(ShelveLoc)
#c
lm.fit=lm(mpg~.-name,data=Auto)
summary(lm.fit)
#i. yes, F stat is far from one and very small p value
#iii. mpg incereases by the value of the coefficient
#d
plot(lm.fit)
par(mfrow=c(1,1))
plot(predict(lm.fit),rstudent(lm.fit))
#e
lm.fit1=lm(mpg~cylinders*displacement+displacement:weight)
summary(lm.fit1)
#f
lm.fit=lm(mpg~sqrt(weight)+log(displacement)+I(cylinders^2),data=Auto)
summary(lm.fit)
##10
#a
attach(Carseats)
names(Carseats)
summary(Carseats)
fix(Carseats)
lm.fit=lm(Sales~Price+Urban+US,data=Carseats)
summary(lm.fit)
#b
# as price increases , sales decrease . significant , small p value
# the model suggests that there is no relationship between the location of the s
tore and sales
# the fact that the store is located in the US is significant. sales increase by
1200
#c
#sales=13.04-0.05*price-0.02*urbanYES+1.2*USyes
#d
#for price and usyes
#e
# Smaller model using only Price and US; summarise this fit (lm.fit2),
# not the three-predictor lm.fit
lm.fit2=lm(Sales~Price+US,data=Carseats)
summary(lm.fit2)
#f
#similar values for R Squared and RSE
#g
confint(lm.fit2)
#h
plot(predict(lm.fit2),rstudent(lm.fit2)) # all betweeen -3 and 3 so no outliers
plot(lm.fit2) #points that exceed (p+1) / n have high leverage
dim(Carseats)
(2+1)/400
##11
#a
# Regression through the origin on simulated data: y on x, then x on y.
set.seed(1)
x=rnorm(100)
y=2*x+rnorm(100)
lm.fit=lm(y~x+0)
summary(lm.fit)
#b
lm.fit1=lm(x~y+0)
summary(lm.fit1)
#c
plot(x,y)
abline(lm.fit)
abline(lm.fit1)
#d
# t statistic computed directly from sums of squares and cross products;
# the expression is symmetric in x and y, so it equals the t value shown
# in both summaries above
t.stat=sqrt(length(x)-1)*sum(x*y)/sqrt(sum(x^2)*sum(y^2)-sum(x*y)^2)
t.stat
#f
lm.fit1=lm(x~y)
lm.fit2=lm(y~x)
summary(lm.fit1)
summary(lm.fit2)
##15
#a
library(MASS)
attach(Boston)
names(Boston)
lm.fit1=lm(crim~zn,data=Boston)
summary(lm.fit1)
#...
#b
lm.fit.all=lm(crim~.,data=Boston)
summary(lm.fit.all) # zn, age, dis, black, medv
################################################################################
#######################################################
#ala cu boxplots rezumat , apoi lab si exercitii
#Logisitc regression, LDA, QDA, KNN
################################################################################
######################################################
# PAGE 151. !!
#Both LOG REG and LDA produce linear decision boundaries. The only difference
#lies in the fact that B0 and B1 are estimated using maximum likelihood,
#whereas c0 and c1 are computed using the estimated mean and variance from a
#normal distribution.
# LDA assumes that the obs. are drawn from a gaussian distribution with a
# common covariance matrix in each class and provides improvements over
# log. reg. when this assumption holds. When the gaussian assumptions are
# not met, log. reg. outperforms LDA.
# On the other hand KNN is expected to dominate log. reg. and LDA when the
# boundary is highly non-linear, because it is non-parametric and makes no
# assumption about the shape. KNN doesn't tell which predictors are
# important and doesn't give a coefficients table.
# QDA serves as an intermediary between LDA + log. reg. and KNN because it
# assumes a quadratic decision boundary, and therefore is more flexible.
# ##############################################################################
#######################################################
#
# #Scenario 1 (linear): 20 obs in each class, uncorrelated random normal obs.,
# different mean in each class.
#
# LDA first because it assumes exactly this boundary, Log Reg after bc. i
library(ISLR)
names(Smarket)
dim(Smarket)
summary(Smarket)
pairs(Smarket)
cor(Smarket)
cor(Smarket[,-9])
attach(Smarket)
plot(Volume)
#4.6.2 Logistic Regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family = bino
mial)
summary(glm.fit)
coef(glm.fit)
summary(glm.fit)$coef
lda.class=lda.pred$class
table(lda.class,Direction.2005)
mean(lda.class==Direction.2005)
#apply 50% threshold to the posterior probabilities and recreate the predictions
in lda.pred$class
sum(lda.pred$posterior[,1]>=.5)
sum(lda.pred$posterior[,1]<.5)
lda.pred$posterior[1:20,1]
lda.class[1:20]
#different probability threshold
sum(lda.pred$posterior[,1]>.9)
#4.6.4 QDA
library(MASS)
qda.fit=qda(Direction~Lag1+Lag2,data=Smarket,subset=train)
qda.fit# contains the group means but not the coeffs bc QDA involves a quadratic
functions
qda.class=predict(qda.fit,Smarket.2005)$class #same as
table(qda.class,Direction.2005)
(30+121)/(30+121+81+20)
mean(qda.class==Direction.2005)
#4.6.5 KNN
# matrix containing the predictors associated with the training data, labeled tr
ain.x
# matrix containing the predictors associated with the test data, labeled test.x
# a vector containing the class labels for the training observations, labeled tr
ain.Direction
# a value for K, number of nearest neighbors
library(class)
#4.6.6 Application to caravan insurance data
attach(Caravan)
standardized.x=scale(Caravan[,-86])
test=1:1000
train.x=standardized.x[-test,]
test.x=standardized.x[test,]
train.y=Purchase[-test]
test.y=Purchase[test]
set.seed(1)
knn.pred=knn(train.x,test.x,train.y,k=1)
mean(test.y!=knn.pred)
mean(test.y!="No")
table(knn.pred,test.y)
#log reg with .25 threshold
glm.fit=glm(Purchase~.,data=Caravan,family=binomial,subset=-test)
glm.probs=predict(glm.fit,Caravan[test,],type="response")
glm.pred=rep("No",1000)
glm.pred[glm.probs>.25]="Yes"
table(glm.pred,test.y)
#4.7 Exercises
#9
library(ISLR)
attach(Weekly)
#a
pairs(Weekly) #year and volume
cor(Weekly)
summary(Weekly)
cor(Weekly[,-9])
#b
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Weekly,family=binomia
l)
summary(glm.fit) #lag2
#c
# In-sample predictions at a 0.5 threshold and the resulting confusion matrix
# (rows = predicted, columns = actual Direction).
glm.probs=predict(glm.fit,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
conf.mat=table(glm.pred,Direction)
conf.mat
# overall fraction correct, taken from the table rather than typed-in counts
(conf.mat["Down","Down"]+conf.mat["Up","Up"])/sum(conf.mat)
mean(glm.pred==Direction)
# fraction of actual "Up" weeks predicted correctly
conf.mat["Up","Up"]/sum(conf.mat[,"Up"]) #up
# fraction of actual "Down" weeks predicted correctly
conf.mat["Down","Down"]/sum(conf.mat[,"Down"]) #down
#d
train=(Year<2009)
Weekly.test=Weekly[!train,]
glm.fit=glm(Direction~Lag2,data=Weekly,subset=train,family=binomial)
glm.probs=predict(glm.fit,Weekly.test,type="response")
glm.pred=rep("Down",length(glm.probs))
glm.pred[glm.probs>.5]="Up"
Direction.test=Direction[!train]
table(glm.pred,Direction.test)
train = (Year < 2009)
Weekly.0910 = Weekly[!train, ]
glm.fit = glm(Direction ~ Lag2, data = Weekly, family = binomial, subset = train
)
glm.probs = predict(glm.fit, Weekly.0910, type = "response")
glm.pred = rep("Down", length(glm.probs))
glm.pred[glm.probs > 0.5] = "Up"
Direction.0910 = Direction[!train]
table(glm.pred, Direction.0910)
(9+56)/(9+56+34+5)
mean(glm.pred==Direction.0910)
#e
library(MASS)
lda.fit=lda(Direction~Lag2,data=Weekly,subset=train)
lda.pred=predict(lda.fit,Weekly.0910)
lda.class=lda.pred$class
table(lda.class,Direction.0910)
(9+56)/(34+56+14)
mean(lda.class==Direction.0910)
#f
qda.fit=qda(Direction~Lag2,data=Weekly,subset = train)
qda.class=predict(qda.fit,Weekly.0910)$class
table(qda.class,Direction.0910)
mean(qda.class==Direction.0910)
#g
library(class)
train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=1)
table(knn.pred,Direction.0910)
(21+31)/(21+31+22+30)
mean(knn.pred==Direction.0910)
#h
#lda
#i
library(class)
train.x=as.matrix(Lag2[train])
test.x=as.matrix(Lag2[!train])
train.direction=Direction[train]
set.seed(1)
knn.pred=knn(train.x,test.x,train.direction,k=15)
mean(knn.pred==Direction.0910)
#11
#a
attach(Auto)
dim=dim(Auto)[1]
mpg01=rep(0,dim)
mpg01
mpg01[mpg>median(mpg)]=1
mpg01
Auto=data.frame(Auto,mpg01)
fix(Auto)
#b
summary(Auto)
# cor() needs numeric input, so keep only the numeric columns; a negative
# index past the last column removes nothing
cor(Auto[,vapply(Auto,is.numeric,logical(1))])
pairs(Auto)
#c
train=(year%%2==0)
test=!train
Auto.train=Auto[train,]
Auto.test=Auto[test,]
mpg01.test=mpg01[test]
#cylinders + weight + displacement + horsepower
#d
library(MASS)
# LDA fitted on the training rows (even model years), evaluated on the rest
lda.fit=lda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,
            subset=train)
lda.pred=predict(lda.fit,Auto.test)
# predict() returns a list; the predicted labels are in $class
mean(lda.pred$class!=mpg01.test) #12% test error rate
#e
qda.fit=qda(mpg01~cylinders+weight+displacement+horsepower,data=Auto,
            subset=train)
qda.class=predict(qda.fit,Auto.test)$class
# QDA test error rate, computed the same way as for LDA
mean(qda.class!=mpg01.test)
#13
library(MASS)
attach(Boston)
summary(Boston)
#??
################################################################################
#################################
#5.3.1 Validation Set Aproach
library(ISLR)
set.seed(1)
train=sample(392,196)
train
attach(Auto)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
set.seed(2)
train=sample(392,196)
lm.fit=lm(mpg~horsepower,data=Auto,subset=train)
mean((mpg-predict(lm.fit,Auto))[-train]^2)
lm.fit2=lm(mpg~poly(horsepower,2),data=Auto,subset=train)
mean((mpg-predict(lm.fit2,Auto))[-train]^2)
lm.fit3=lm(mpg~poly(horsepower,3),data=Auto,subset=train)
mean((mpg-predict(lm.fit3,Auto))[-train]^2)
#5.3.2 LOOCV
#we use glm() not lm() bc. we can use cv.glm() for crossvalidation
glm.fit=glm(mpg~horsepower,data=Auto)
coef(glm.fit)
lm.fit=lm(mpg~horsepower,data=Auto)
coef(lm.fit)
#they are the same
library(boot)
glm.fit=glm(mpg~horsepower,data=Auto)
cv.err=cv.glm(Auto,glm.fit)
cv.err$delta
#delta cointains cv results
#for loop for polynomial fits
cv.error=rep(0,5)
for(i in 1:5)
{glm.fit=glm(mpg~poly(horsepower,i),data=Auto)
cv.error[i]=cv.glm(Auto,glm.fit)$delta[1]
}
cv.error
var(X)+var(Y)-2*cov(X,Y)
}
alpha.fn(Portfolio,1:100)
#this is automated using the boot() function
boot(Portfolio,alpha.fn,R=1000)
#create a function that takes a set and indices and returns slope and intercept
# Statistic for boot(): intercept and slope of mpg ~ horsepower, fitted on
# the rows of `data` selected by `index`.
boot.fn=function(data,index)
{
  resampled.fit=lm(mpg~horsepower,data=data,subset=index)
  coef(resampled.fit)
}
# coefficients on the full sample, then on one bootstrap resample
boot.fn(Auto,1:392)
boot.fn(Auto,sample(392,392,replace=TRUE))
# bootstrap standard errors from 1000 resamples
boot(Auto,boot.fn,1000)
# formula-based estimates for comparison; `data` and `index` are arguments of
# boot.fn and do not exist at top level, so fit on Auto directly
summary(lm(mpg~horsepower,data=Auto))$coef
boot.fn=function (data ,index )
{ coefficients(lm(mpg~horsepower +I( horsepower ^2) ,data=data , subset =index
)) }
set.seed (1)
boot(Auto ,boot.fn ,1000)
##5.4 execises
#5
#a
library(ISLR)
attach(Default)
glm.fit=glm(default~income+balance,data=Default,family = binomial)
#b
# Validation-set estimate of the test error for default ~ income + balance.
# Reads the data frame `Default` from the enclosing environment, draws a
# random half of its rows for training, fits a logistic regression on them
# and returns the misclassification rate on the remaining rows using a 0.5
# threshold on the predicted probability.
FiveB= function()
{
  train=sample(dim(Default)[1],dim(Default)[1]/2)
  glm.fit=glm(default~income+balance,data=Default,family = binomial,
              subset=train)
  glm.probs=predict(glm.fit,Default[-train,],type="response")
  # size the label vector from the predictions so an odd row count also works
  glm.pred=rep("No",length(glm.probs))
  glm.pred[glm.probs>0.5]="Yes"
  return( mean( glm.pred!=Default[-train,]$default ) )
}
FiveB()
#c
#d
# Validation-set estimate of the test error for
# default ~ income + balance + student.
# Reads the data frame `Default` from the enclosing environment, trains a
# logistic regression on a random half of the rows and returns the
# misclassification rate on the other rows (0.5 probability threshold).
FiveB= function()
{
  train=sample(dim(Default)[1],dim(Default)[1]/2)
  glm.fit=glm(default~income+balance+student,data=Default,family = binomial,
              subset=train)
  glm.probs=predict(glm.fit,Default[-train,],type="response")
  # size the label vector from the predictions so an odd row count also works
  glm.pred=rep("No",length(glm.probs))
  glm.pred[glm.probs>0.5]="Yes"
  return( mean( glm.pred!=Default[-train,]$default ) )
}
FiveB()
#6
#a
library(ISLR)
attach(Default)
set.seed(1)
glm.def=glm(default~income+balance,data=Default,family = binomial)
summary(glm.def)
#b
# Statistic for boot(): coefficients of the logistic regression
# default ~ income + balance, fitted on the rows of `data` given by `index`.
boot.fn = function(data, index) {
  logit.fit = glm(default ~ income + balance, data = data, family = binomial,
                  subset = index)
  coef(logit.fit)
}
#c
library(boot)
boot(Default,boot.fn,50)
#d
#same
#8
#a
set.seed (1)
y=rnorm (100)
x=rnorm (100)
y=x-2* x^2+ rnorm (100)
#b
plot(x,y)
#c
Data=data.frame(x,y)
glm.fit=glm(y~x)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2))
#d
set.seed(2)
Data=data.frame(x,y)
glm.fit=glm(y~x)
cv.glm(Data,glm.fit)$delta
glm.fit=glm(y~poly(x,2))
#9
#a
attach(Boston)
################################################################################
###################################
#6.5 Lab 1: Subset Selection Methods
library(ISLR)
attach(Hitters)
sum(is.na(Hitters))
Hitters=na.omit(Hitters)
sum(is.na(Hitters))
#regsubsets() used for best subset selection using RSS [same syntax as lm]
library(leaps)
regfit.full=regsubsets(Salary~.,data=Hitters)
summary(regfit.full)
#the function reports the best model up to eight variables but var no can be cha
nged using nvmax=...
regfit.full=regsubsets(Salary~.,data=Hitters,nvmax = 19)
reg.summary=summary(regfit.full)
reg.summary
#we can use R SQ, RSS, Adj R sq, Cp and BIC to asses the model
names(reg.summary)
reg.summary$rsq
#plot R SQ., Adj. R, Cp and BIC at once to asses the model
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="no of var",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max (reg.summary$adjr2)
points(11,reg.summary$adjr2[11],col="red",cex=2,pch=20)#plot on the plot the bes
t point
plot(reg.summary$cp,xlab="no of var",ylab="CP",type="l")
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="blue",cex=3,pch=21)
plot(reg.summary$bic,xlab="no of var",ylab="BIC",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],col="blue",cex=3,pch=21)
plot(regfit.full ,scale ="r2")
#finnaly we perform best subset on the full data set, and select the best 10 va
riable model!
reg.full=regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(reg.full,10)
##CV
#we create a vector that allocates each observation to one of k=10 folds and cre
ate a matrix that stores the results
k=10
set.seed(1)
folds=sample(1:k,nrow(Hitters),replace=T)
folds
cv.errors=matrix(NA,k,19,dimnames = list(NULL,paste(1:19)))
cv.errors
#
cv.errors=matrix(NA,k,19)
cv.errors
#
#we write a loop that performs cross validation.
for(j in 1:k)
{
best.fit=regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19)
for(i in 1:19)
{
pred=predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i]=mean((Hitters$Salary[folds==j]-pred )^2)
}
}
#
for(j in 1:k){
best.fit =regsubsets (Salary~.,data=Hitters [folds !=j,],
nvmax =19)
for(i in 1:19) {
pred=predict (best.fit ,Hitters [folds ==j,], id=i)
cv.errors [j,i]=mean( (Hitters$Salary[folds ==j]-pred)^2)
}
}
#
cv.errors
# Refit best subset selection on all rows to get the 11-variable model.
# After the loops j still holds its last value, so subsetting on folds!=j
# here would leave one fold out of the fit.
best.fit =regsubsets (Salary~.,data=Hitters,nvmax =19)
coef(best.fit,11)
##6.6 Lab 2: Ridge Regression and the Lasso
#we use the glmnet() func to fit ridge and lasso. the function needs x as an mat
rix and y as a vector
#we remove missing values
x=model.matrix(Salary~.,Hitters)[,-1]
y=Hitters$Salary
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T)
mean ( (ridge.pred-y.test)^2 )
predict(ridge.mod,s=0,exact=T,type="coefficients")[1:20,]
#instead of choosing lambda by hand we can use cross validation with the cv.glmn
et(), which performs ten fold
#validation [ no. of folds can be changed using nfolds argument.]
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam
#212 is best lambda value
#MSE for this value of lambda is
ridge.pred=predict(ridge.mod,s=bestlam,newx=x[test,])
mean( (ridge.pred-y.test)^2 )
#fit model on full data set and get coeffs
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:20,]
## 6.6.2 The Lasso
library(glmnet)
lasso.mod=glmnet(x[train,],y[train],alpha=1,lambda = grid)
plot(lasso.mod)
#Cv lasso
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
plot(cv.out)
bestlam=cv.out$lambda.min
lasso.pred=predict(lasso.mod,s=bestlam,newx=x[test,])
mean(
(lasso.pred-y.test)^2
)
#apply on full set
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:20,]
lasso.coef
#6.7 Lab 3: PCR and PLS Regression
#6.7.1 Principal Components Regression
library(pls)
set.seed(2)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,validation="CV") # scale = T is scalin
g and CV is ten fold CV
summary(pcr.fit) # 38.31 % explained by one variable ....
## !!!!! PCR reports root MSE , so we have to square this quantity to obtain rea
l MSE
validationplot(pcr.fit,val.type="MSE") #CV scores
#perform PCR on the training data and evaluate its test performance
set.seed(1)
pcr.fit=pcr(Salary~.,data=Hitters,scale=T,subset=train,validation="CV")
validationplot(pcr.fit,val.type = "MSEP")
pcr.pred=predict(pcr.fit,x[test,],ncomp = 7)
mean( (pcr.pred-y.test)^2 )
points(3,regfit.sum$adjr2[3],cex=4,pch=33)
#CP
plot(regfit.sum$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum$cp)
points(3,regfit.sum$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum$bic)
points(3,regfit.sum$bic[3],cex=4,pch=33)
coefficients(regfit.fwd,id=3)
regfit.bwd=regsubsets(y~poly(x,10,raw=T),data=data,nvmax=10,method="backward")
regfit.sum1=summary(regfit.bwd)
#adj r sq
plot(regfit.sum1$adjr2,xlab="no of var",ylab="adj r sq",type="l")
which.max(regfit.sum1$adjr2)
points(4,regfit.sum1$adjr2[3],cex=4,pch=33)
#CP
plot(regfit.sum1$cp,xlab="no of var",ylab="CP",type="l")
which.min(regfit.sum1$cp)
points(3,regfit.sum1$cp[3],cex=4,pch=33)
#BIC
plot(regfit.sum1$bic,xlab="no of var",ylab="bic",type="l")
which.min(regfit.sum1$bic)
points(3,regfit.sum1$bic[3],cex=4,pch=33)
coefficients(regfit.bwd,id=3)
coefficients(regfit.bwd,id=4)
#e
library(glmnet)
# design matrix of the raw polynomial terms of x up to degree 10, with the
# intercept column dropped
xmat=model.matrix(y~poly(x,10,raw=TRUE),data=data)[,-1]
fix(data)
# choose lambda by cross-validation
cv.out=cv.glmnet(xmat,y,alpha=1)
best.lam=cv.out$lambda.min
plot(cv.out)
#fit model on the full data using best lambda found
lasso.mod=glmnet(xmat,y,alpha=1)
predict(lasso.mod,s=best.lam,type="coefficients")
#f
beta7 = 7
y = beta0 + beta7 * x^7 + eps
data=data.frame(x,y)
regfit.full=regsubsets(y~poly(x,10,raw=T),data=data,nvmax = 10)
reg.summary=summary(regfit.full)
which.max(reg.summary$adjr2)
which.min(reg.summary$cp)
which.min(reg.summary$bic)
coefficients(regfit.full,id=4)
coefficients(regfit.full,id=2)
coefficients(regfit.full,id=1)
xmat=model.matrix(y~poly(x,10,raw=T),data=data)[,-1]
mod.lasso=cv.glmnet(xmat,y,alpha=1)
best.lam=mod.lasso$lambda.min
best.lam
best.model=cv.glmnet(xmat,y,alpha=1)
predict(best.model,s=best.lam,type="coefficients")
##9
#a
library(ISLR)
rm(College)
fix(College)
sum(is.na(College))
set.seed(11)
train.size=dim(College)[1]/2
train=sample(1:dim(College)[1],train.size)
dim(College)
train.size
dim(College)[1]
college.train=College[train,]
college.test=College[-train,]
#b
# Least squares on the training half; test MSE on the held-out half
lm.fit=lm(Apps~.,data=college.train)
lm.pred=predict(lm.fit,college.test)
mean( (college.test[,"Apps"]-lm.pred)^2 )
college.test[,"Apps"]
#c
# Ridge regression with lambda chosen by cross-validation over `grid`
library(glmnet)
train.mat=model.matrix(Apps~.,data=college.train)
test.mat=model.matrix(Apps~.,data=college.test)
grid=10^seq(4,-2,length.out=100)
ridge.mod=cv.glmnet(train.mat,college.train[,"Apps"],alpha=0,lambda=grid,
                    thresh=1e-12)
lambda.best=ridge.mod$lambda.min
lambda.best
ridge.pred=predict(ridge.mod,s=lambda.best,newx = test.mat)
mean( (college.test[,"Apps"]-ridge.pred)^2 )
#d
# Lasso with lambda chosen by cross-validation over the same grid
mod.lasso=cv.glmnet(train.mat,college.train[,"Apps"],alpha=1,lambda=grid,
                    thresh=1e-12)
lambda.best=mod.lasso$lambda.min
lambda.best
lasso.pred=predict(mod.lasso,newx=test.mat,s=lambda.best)
mean( (college.test[,"Apps"]-lasso.pred)^2 )
#coefs
mod.lasso=glmnet(model.matrix(Apps~.,data=College),College[,"Apps"],alpha=1)
predict(mod.lasso,s=lambda.best,type="coefficients")
#e
# Principal components regression with 10 components
library(pls)
cr.fit=pcr(Apps~.,data=college.train,scale=TRUE,validation="CV")
validationplot(cr.fit,val.type = "MSEP")
pcr.pred=predict(cr.fit,college.test,ncomp=10)
# drop() turns the prediction array into a plain vector; mean() of a
# data.frame would return NA
mean( (college.test[,"Apps"]-drop(pcr.pred))^2 )
#f
# Partial least squares with 10 components, kept in its own objects so the
# PCR results above are not overwritten
pls.fit=plsr(Apps~.,data=college.train,scale=TRUE,validation="CV")
validationplot(pls.fit,val.type = "MSEP")
pls.pred=predict(pls.fit,college.test,ncomp=10)
mean( (college.test[,"Apps"]-drop(pls.pred))^2 )
#g
# Test R-squared of each method: 1 - MSE / mean squared deviation from the
# test-set average
test.avg = mean(college.test[, "Apps"])
test.tss = mean((college.test[, "Apps"] - test.avg)^2)
lm.test.r2 = 1 - mean((college.test[, "Apps"] - lm.pred)^2) / test.tss
ridge.test.r2 = 1 - mean((college.test[, "Apps"] - ridge.pred)^2) / test.tss
lasso.test.r2 = 1 - mean((college.test[, "Apps"] - lasso.pred)^2) / test.tss
pcr.test.r2 = 1 - mean((college.test[, "Apps"] - drop(pcr.pred))^2) / test.tss
pls.test.r2 = 1 - mean((college.test[, "Apps"] - drop(pls.pred))^2) / test.tss
barplot(c(lm.test.r2, ridge.test.r2, lasso.test.r2, pcr.test.r2, pls.test.r2),
        col="red", names.arg=c("OLS", "Ridge", "Lasso", "PCR", "PLS"),
        main="Test R-squared")
##10
#a
# Simulate n observations of p predictors with a sparse coefficient vector,
# then split them into 100 training and 900 test observations.
set.seed(1)
n=1000
p=20
x=matrix(rnorm(n*p),n,p)
# show only the first rows; the full matrix has n*p entries
head(x)
b=rnorm(p)
# five coefficients are set to exactly zero
b[c(3,4,9,19,10)]=0
# one noise term per observation (n of them, not p)
eps=rnorm(n)
# matrix product gives the n x 1 response; x*b would recycle b element-wise
# and return an n x p matrix
y=x%*%b+eps
plot(x)
#b
set.seed(1)
train=sample(seq(1000),100,replace=FALSE)
y.train=y[train]
y.test=y[-train]
x.train=x[train,]
x.test=x[-train,]
#c
#Perform best subset selection on the training set, and plot the
#training set MSE associated with the best model of each size.
library(leaps)
regfit.full = regsubsets(y ~ ., data = data.frame(x = x.train, y = y.train), nvm
ax = p)
# Training MSE of the best model of each size.
val.errors = rep(NA, p)
# names the predictors get inside data.frame(x = ...): x.1, ..., x.p
x_cols = colnames(x, do.NULL = FALSE, prefix = "x.")
x_cols
for (i in 1:p) {
  coefi = coef(regfit.full, id = i)
  is.slope = names(coefi) != "(Intercept)"
  # columns of x used by this model, in the order of the coefficients
  sel = match(names(coefi)[is.slope], x_cols)
  # fitted values include the intercept, which selecting by x_cols alone drops
  pred = coefi["(Intercept)"] + x.train[, sel, drop = FALSE] %*% coefi[is.slope]
  val.errors[i] = mean((y.train - pred)^2)
}
plot(val.errors, ylab = "Training MSE", pch = 19, type = "b")
#d
# Test MSE of the same models.
val.errors = rep(NA, p)
for (i in 1:p) {
  coefi = coef(regfit.full, id = i)
  is.slope = names(coefi) != "(Intercept)"
  sel = match(names(coefi)[is.slope], x_cols)
  pred = coefi["(Intercept)"] + x.test[, sel, drop = FALSE] %*% coefi[is.slope]
  val.errors[i] = mean((y.test - pred)^2)
}
plot(val.errors, ylab = "Test MSE", pch = 19, type = "b")
#e
which.min(val.errors)
#f
coef(regfit.full,id=16)
#g
##11
#a
#best subset
set.seed(1)
library(MASS)
library(leaps)
library(glmnet)
rm(Boston)
fix(Boston)
predict.regsubsets = function(object, newdata, id, ...) {
form = as.formula(object$call[[2]])
#SMOOTHING SPLINES:
similar to regression splines, but result from minimizin
g a RSS subject to a smoothness penalty
#LOCAL REGRESSION: similar to splines but the regions are allowed to overlap
#GAM's: allow the extension of the above methods in order to deal with multiple
predictors
################################################################################
###################################
# 7.8 Lab: Non-linear Modeling
library(ISLR)
attach(Wage)
#7.8.1 Polynomial Regression and Step Functions
fit=lm(wage~poly(age,4),data=Wage)
coef(summary(fit))
fit2=lm(wage~cbind(age,age^2,age^3,age^4),data=Wage)
#create a grid of values for age at which we want predictions
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
preds=predict(fit,newdata = list(age=age.grid),se=T)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
se.bands
plot(age,wage,col="red")
title("Degree 4 polynomial",outer=T)
lines(age.grid,preds$fit,lwd=2,col="blue")
matlines(age.grid,se.bands,col="blue")
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5)
fit.1= lm(wage~education +age ,data=Wage)
fit.2= lm(wage~education +poly(age ,2) ,data=Wage)
fit.3= lm(wage~education +poly(age ,3) ,data=Wage)
anova(fit.1, fit.2, fit.3)
#STEP Functions
# Next we consider the task of predicting whether an individual earns more
# than $250,000 per year.
# Logistic regression of the event wage > 250 on a degree-4 polynomial in age
fit=glm(I(wage>250)~poly(age,4),data=Wage,family = binomial)
# predict() for a glm defaults to the link (logit) scale, so fit and se.fit
# are both on that scale
preds=predict(fit,newdata = list(age=age.grid),se=TRUE)
pfit=exp(preds$fit)/(1+exp(preds$fit))
se.bands.logit=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
# back-transform the band with the same inverse logit used for pfit
se.bands=exp(se.bands.logit)/(1+exp(se.bands.logit))
plot(age,I(wage>250),xlim=agelims,ylim=c(0,.2),type="n")
points(jitter(age),I((wage>250)/5),pch="|",col="darkgrey")
lines(age.grid,pfit,col="blue")
matlines(age.grid,se.bands,col="blue")
# We have drawn the age values corresponding to the observations with wage
# values above 250 as gray marks on the top of the plot, and those with wage
# values below 250 are shown as gray marks on the bottom of the plot.
table(cut(age,4))
fit=lm(wage~cut(age,4),data=Wage)
coef(summary(fit))
#The age<33.5 category is left out, so the intercept coefficient of
#$94,160 can be interpreted as the average salary for those under 33.5 years
#of age, and the other coefficients can be interpreted as the average additional
#salary for those in the other age groups.
#7.8.2 Splines
#fit wage to age using a regression spline. by default, cubic regression splines
are used
attach(Wage)
library(splines)
fit=lm(wage~bs(age,knots=c(25,40,60)),data=Wage)
pred=predict(fit,newdata = list(age=age.grid),se=T)
plot(age,wage,col="red")
lines(age.grid,pred$fit,lwd=4)
lines(age.grid,pred$fit+2*pred$se,lty="dashed")
lines(age.grid,pred$fit-2*pred$se,lty="dashed")
#here we have made knots at 25,40,60 which produces a spline with six basis func
tions .
# a cubic spline with three knots has seven degrees of freedom - one for interce
pt plus six basis functions
attr(bs(age,df=6),"knots")
#r chooses the splits
#bs has arg df which chooses the degree of the function rather that the default
cubic one
#NATURAL SPLINES
#ns with 4 df
# natural spline with 4 degrees of freedom
fit2=lm(wage~ns(age,df=4),data=Wage)
# predict from the natural-spline fit (fit2), not from `fit`
pred2=predict(fit2,newdata=list(age=age.grid),se=TRUE)
lines(age.grid,pred2$fit,col="blue",lwd=4)
#as with the bs , we could have specified the knots dirrectly using knots
#SMOOTHING SPLINE
plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Smoothing Spline")
fit=smooth.spline(age,wage,df=16)
fit1=smooth.spline(age,wage,cv=T)
fit1$df
lines(fit,col="red")
lines(fit1,col="blue")
legend("topright",legend=c("16 DF","6.8 DF"),col=c("red","blue"),lty=1,lwd=2,cex
=.8)
#when we specified df 16 the funct calculates the value of lambda needed for 16
df
#when we selected df chosen by CV which yields a df of 6.8
#LOCAL REGRESSION
plot(age,wage,xlim=agelims,col="darkgrey")
title("Local Regression")
fit=loess(wage~age,span=.2,data=Wage)
fit1=loess(wage~age,span=.5,data=Wage)
lines(age.grid,predict(fit,newdata=data.frame(age=age.grid)))
lines(age.grid,predict(fit1,newdata=data.frame(age=age.grid)))
#local regr with spans .2 and .5 . each neighborhood consists of 20% or 50% of t
he observations.
# the longer the span the smoother the regression
#GAMS
#we fit a GAM to predict wage using natural spline functions of year and age , t
reating education as a qualitative pred
gam1=lm(wage~ns(year,4)+ns(age,5)+education,data=Wage)
#we now fit a model that uses smoothing splines, rather than natural splines .
#we need to use the gam() function
library(gam)
# s() is used for smooothing spline
gam.m3=gam(wage~s(year,4)+s(age,4)+education,data=Wage)
par(mfrow=c(1,3))
plot(gam.m3,se=T)
plot(gam1)
plot.gam(gam1,se=T)
#m1 gam that excludes year
#m2 gam that uses linear funct of year
#m3 gam that uses a spline function
gam.m1=gam(wage~s(age,5)+education,data=Wage)
gam.m2=gam(wage~year+s(age,5)+education,data=Wage)
gam.m3=gam(wage~s(year,4)+s(age,5)+education,data=Wage)
anova(gam.m1,gam.m2,gam.m3,test="F")
summary(gam.m3)
#the p values of the model reinforce the ideea that a linear model is needed for
year and a non linear one for age
#predictions on the training set
preds=predict(gam.m2,newdata=Wage)
# we can also use local regression as the building blocks of GAM with the lo() f
gam.lo=gam(wage~s(year,df=4)+lo(age,span=0.7)+education,data=Wage)
plot.gam(gam.lo,se=T)
gam.lo.i=gam(wage~lo(year,age,span=0.5),data=Wage)
library(akima)
plot(gam.lo.i)
par(mfrow=c(1,1))
#gams with log reg
# logistic GAM for the event wage > 250; the data frame is Wage (capital W),
# while lower-case wage is a column
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage)
par(mfrow =c(1,3))
plot(gam.lr,se=TRUE)
table(education,I(wage>250))
# refit on the rows whose education is not "1. < HS Grad"
gam.lr=gam(I(wage>250)~year+s(age,df=5)+education,family=binomial,data=Wage,
           subset=(education!="1. < HS Grad"))
plot(gam.lr,se=TRUE)
################################################################################
#####################################
#7.9 Execrise
#6
#a
set.seed(1)
library(boot)
all.deltas=rep(NA,10)
for(i in 1:10)
{
glm.fit=glm(wage~poly(age,i),data=Wage)
all.deltas[i]=cv.glm(Wage,glm.fit,K=10)$delta[2]
}
all.deltas
plot(1:10,all.deltas,xlab="degree",ylab="CV error",type="b",ylim=c(1590,1700))
min.point=min(all.deltas)
sd.point=sd(all.deltas)
abline(h=min.point+0.2*sd.point,lty="dashed")
abline(h=min.point-0.2*sd.point,lty="dashed")
legend("topright","0.2 sd line ",lty="dashed")
# 3 cuts
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
# fit the degree-3 polynomial first, then predict from that fit so the
# plotted curve belongs to this model
lm.fit=lm(wage~poly(age,3),data=Wage)
preds=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,preds,col="blue",lwd=3)
fit.1=lm(wage~poly(age,1),data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
fit.7=lm(wage~poly(age,7),data=Wage)
fit.8=lm(wage~poly(age,8),data=Wage)
fit.9=lm(wage~poly(age,9),data=Wage)
fit.10=lm(wage~poly(age,10),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9,fit.10)
#b
all.cvs=rep(NA,10)
for(i in 2:10)
{
Wage$age.cut=cut(Wage$age,i)
lm.fit=glm(wage~age.cut,data=Wage)
all.cvs[i]=cv.glm(Wage,lm.fit,K=10)$delta[2]
}
all.cvs
plot(2:10,all.cvs[-1],xlab="no of cuts",ylab="cv err",type="b")
#8 cuts
lm.fit=glm(wage~cut(age,8),data=Wage)
agelims=range(age)
agelims
age.grid=seq(from=agelims[1],to=agelims[2])
age.grid
lm.pred=predict(lm.fit,data.frame(age=age.grid))
plot(wage~age,data=Wage)
lines(age.grid,lm.pred,col="red",lwd=4)
##7
#a
set.seed(1)
summary(Wage$maritl)
plot(Wage$maritl)
summary(Wage$jobclass)
plot(Wage$jobclass)
par(mfrow=c(1,2))
plot(Wage$maritl,Wage$wage)
plot(Wage$jobclass,Wage$wage)
fit=lm(wage~maritl,data=Wage)
deviance(fit)
fit=lm(wage~jobclass,data=Wage)
deviance(fit)
fit=lm(wage~maritl+jobclass,data=Wage)
deviance(fit)
#gam
fit=gam(wage~maritl+jobclass+s(age,4),data=Wage)
deviance(fit)
##8
pairs(Auto)
#mpg inv prop to cyl displ horesp weight
cv.errs=rep(NA,10)
for (i in 1:10)
{
fit=glm(mpg~poly(displacement,i),data=Auto)
cv.errs[i]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)
#10 th degree polynomial
attach(Auto)
plot(displacement,mpg)
lm.fit.poly=glm(mpg~poly(displacement,10),data=Auto)
summary(displacement)
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
preds=predict(lm.fit.poly,data.frame(displacement=disp.grid))
lines(disp.grid,preds,col="red",lwd=5)
pol1=lm(mpg~poly(displacement,1),data=Auto)
pol2=lm(mpg~poly(displacement,2),data=Auto)
pol3=lm(mpg~poly(displacement,3),data=Auto)
pol4=lm(mpg~poly(displacement,4),data=Auto)
pol5=lm(mpg~poly(displacement,5),data=Auto)
pol6=lm(mpg~poly(displacement,6),data=Auto)
pol7=lm(mpg~poly(displacement,7),data=Auto)
pol8=lm(mpg~poly(displacement,8),data=Auto)
pol9=lm(mpg~poly(displacement,9),data=Auto)
pol10=lm(mpg~poly(displacement,10),data=Auto)
anova(pol1,pol2,pol3,pol4,pol5,pol6,pol7,pol8,pol9,pol10)
#step f
# 10-fold CV error of step functions of displacement with 2..10 intervals
all.cvs=rep(NA,10)
for(i in 2:10)
{
  Auto$dis.cut=cut(Auto$displacement,i)
  lm.fit=glm(mpg~dis.cut,data=Auto)
  all.cvs[i]=cv.glm(Auto,lm.fit,K=10)$delta[2]
}
all.cvs
# entry 1 stays NA because the loop starts at 2, so plot entries 2..10
plot(2:10,all.cvs[-1],xlab="no of cuts",ylab="CV error",type="b")
which.min(all.cvs)
#9 cuts
disprange=range(displacement)
disprange
disp.grid=seq(from=disprange[1],to=disprange[2])
# refit with cut() inside the formula so predict() can rebuild the bins from
# a grid that only has a displacement column
# NOTE(review): assumes the grid spans the same range as the data so cut()
# produces the same intervals - confirm
lm.fit=glm(mpg~cut(displacement,9),data=Auto)
preds=predict(lm.fit,data.frame(displacement=disp.grid))
plot(displacement,mpg)
lines(disp.grid,preds,col="red",lwd=5)
#splines
library(splines)
cv.errs=rep(NA,10)
for(df in 3:10)
{
fit=glm(mpg~ns(displacement,df=df),data=Auto)
cv.errs[df]=cv.glm(Auto,fit,K=10)$delta[2]
}
cv.errs
which.min(cv.errs)
# scatter plot of the data with the 10-df natural-spline fit drawn on top
plot(displacement,mpg)
fit2=lm(mpg~ns(displacement,df=10),data=Auto)
pred2=predict(fit2,newdata=list(displacement=disp.grid),se=TRUE)
lines(disp.grid,pred2$fit,col="blue",lwd=4)
#gams
fit = gam(mpg ~ s(displacement, 4) + s(horsepower, 4), data = Auto)
summary(fit)
##9
#a
poly.fit=lm(nox~poly(dis,3),data=Boston)
attach(Boston)
dis.range=range(dis)
dis.range
dis.grid=seq(from=dis.range[1],to=dis.range[2])
preds=predict(poly.fit,data.frame(dis=dis.grid))
plot(dis,nox)
lines(dis.grid,preds,col="red",lwd=5)
title("Pen-Pineapple-Apple-Pen")
#b
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=lm(nox~poly(dis,i),data=Boston)
all.rss[i]=sum(poly.fit$residuals^2)
}
all.rss
plot(1:10,all.rss,type="b")
#c
library(boot)
all.rss=rep(NA,10)
for (i in 1:10)
{
poly.fit=glm(nox~poly(dis,i),data=Boston)
all.rss[i]=cv.glm(Boston,poly.fit,K=10)$delta[2]
}
all.rss
plot(1:10,all.rss,xlab="no of deg",ylab="cv err",type="b")
which.min(all.rss)
#4 knots
#d
library(splines)
sp.fit=lm(nox~bs(dis,df=4,knots=c(4,7,11)),data=Boston)
summary(sp.fit)
sp.pred=predict(sp.fit,list(dis=dis.grid))
plot(nox~dis,data=Boston)
plot(dis,nox)
lines(dis.grid,sp.pred,col="blue",lwd=3)
#e
all.cvs=rep(NA,16)
for(i in 3:16)
{
lm.fit=lm(nox~bs(dis,df=i),data=Boston)
all.cvs[i]=sum(lm.fit$residuals^2)
}
all.cvs
which.min(all.cvs)
#f
all.cv = rep(NA, 16)
for (i in 3:16) {
lm.fit = glm(nox ~ bs(dis, df = i), data = Boston)
all.cv[i] = cv.glm(Boston, lm.fit, K = 10)$delta[2]
}
all.cv
plot(3:16,all.cv[-c(1,2)],xlab="no of var",ylab="cv err",type="b")
##10
#a
set.seed(1)
library(ISLR)
attach(College)
fix(College)
train=sample(length(Outstate),length(Outstate)/2)
test=-train
college.train=College[train,]
college.test=College[test,]
#fwd stepwise selection on training set
dim(College)
library(leaps)
regfit.fwd=regsubsets(Outstate~.,data=college.train,nvmax=17,method="forward")
reg.summary=summary(regfit.fwd)
plot(reg.summary$adjr2,xlab="no of var",ylab="adj r sq",type="l",ylim=c(0.4,0.84
))
which.max(reg.summary$adjr2)
points(13,reg.summary$adjr2[13],cex=2.5,pch=15)
max.adjr2=max(reg.summary$adjr2)
sd.adjr2=sd(reg.summary$adjr2)
abline(h=max.adjr2+0.2*sd.adjr2,lty="dashed")
abline(h=max.adjr2-0.2*sd.adjr2,lty="dashed")
max.adjr2-0.2*sd.adjr2
max.adjr2+0.2*sd.adjr2
# BIC by model size, with the selected model marked
plot(reg.summary$bic,xlab="no of var",ylab="bic",type="l")
which.min(reg.summary$bic)
points(6,reg.summary$bic[6],cex=2.5,pch=15)
sd.bic=sd(reg.summary$bic)
# still assigned in case code outside this section refers to it
max.bic=max(reg.summary$bic)
# the model is picked with which.min, so the 0.2-sd band is drawn around the
# minimum BIC rather than the maximum
min.bic=min(reg.summary$bic)
abline(h=min.bic+0.2*sd.bic,lty="dashed")
abline(h=min.bic-0.2*sd.bic,lty="dashed")
min.bic+0.2*sd.bic
min.bic-0.2*sd.bic