#
# Load the kernel smoothing and quantile regression libraries
library(KernSmooth)
library(quantreg)
#
# Read the weekly gas price data - 8/20/90 to 4/02/07
price <- read.table(file="C:/Documents and Settings/Andrew Buck/My Documents/Econ 616/gasprices.csv",
                    header=TRUE, sep=",", dec=".", na.strings="NA", check.names=TRUE)
# Put the price data in lower-case x
x <- price[,2]
# Length of the series
n <- length(x)
# Set up the distributed lags in upper-case X
p <- 5
X <- cbind(x[(p-1):(n-1)], x[(p-2):(n-2)], x[(p-3):(n-3)], x[(p-4):(n-4)])
# Put the current price in y
y <- x[p:n]
#
# Assignment 1. ____________________________________________________________________________________
#
# Instead of using the data in the assignment I am using the gas price data. The homework
# exercises will be done using the residuals from an AR process.
# Plot this week's price against last week's price
plot(X[,1], y, col="blue", cex=0.25, xlab="Last Week's Price", ylab="This Week's Price")
# Add some quantile regression lines
abline(rq(y ~ X[,1], tau=0.1), col="green")
abline(rq(y ~ X[,1], tau=0.5), col="green")
abline(rq(y ~ X[,1], tau=0.9), col="green")
qreg <- rq(y ~ X[,1] + X[,2] + X[,3] + X[,4], tau=0.5)
r1 <- resid(qreg)
plot(r1)
#
# (a) (ii)
set.seed(321)
samp100 <- sample(r1, 100)
samp015 <- samp100[1:15]
samp020 <- samp100[1:20]
samp050 <- samp100[1:50]
#
# (b) (i)
# Compare a histogram with a kernel density estimate
par(mfrow=c(2,1))
hist(samp015)
plot(density(samp015))
#
# (b) (ii)
# Compare the effect of sample size
par(mfrow=c(2,2))
plot(density(samp015))
plot(density(samp020))
plot(density(samp050))
plot(density(samp100))
#
# (c) (i)
# ___________ bin width
# The breaks are extended one step past rmax so they are sure to span the data
par(mfrow=c(2,2))
rmin <- min(r1)
rmax <- max(r1)
hist(samp050, br=seq(rmin, rmax+10, 10))
hist(samp050, br=seq(rmin, rmax+5, 5))
hist(samp050, br=seq(rmin, rmax+2, 2))
hist(samp050, br=seq(rmin, rmax+1, 1))
#
# (c) (i)
# ____________ bandwidth
plot(density(samp100, bw=0.1))
plot(density(samp100, bw=0.2))
plot(density(samp100, bw=0.5))
plot(density(samp100, bw=1.0))
#
# (c) (ii)
# ____________ kernel
par(mfrow=c(3,2))
plot(density(samp100, kernel="gaussian"))
plot(density(samp100, kernel="epanechnikov"))
plot(density(samp100, kernel="triangular"))
plot(density(samp100, kernel="biweight"))
plot(density(samp100, kernel="cosine"))
plot(density(samp100, kernel="optcosine"))
#
# (c) (iii)
# ______________ number of evaluation points (the n argument to density)
par(mfrow=c(2,2))
plot(density(samp100, n=4))
plot(density(samp100, n=8))
plot(density(samp100, n=16))
plot(density(samp100, n=1024))
#
# Assignment 2. ______________________________________________________________
#
# (a) Familiarize yourself with some functions. You should have used similar functions in other software.
# The only new one should be approxfun, which does a linear approximation on a set of (x,y) pairs.
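#
# A minimal illustration of approxfun (the points and query values below are
# made up for demonstration): it returns a *function* that linearly
# interpolates between the supplied (x,y) pairs.
f <- approxfun(c(0, 1, 2), c(0, 10, 0))
f(c(0.5, 1.5))   # both are 5: halfway up the first segment, halfway down the second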
#
credit <- read.csv("C:/Documents and Settings/Andrew Buck/My Documents/Econ 616/Kredit.csv", header=TRUE)
# _____________________ Separate the good credit risks from the bad
c0 <- credit[credit[,1]==0, 2]/1000
c1 <- credit[credit[,1]==1, 2]/1000
#
# (b) (i)
# _____________________ Plot a histogram and the density, then redo the density
# with restricted support. Just for fun I did this for both groups.
#
par(mfrow=c(2,2))
temp0 <- density(c0)
hist(c0, prob=TRUE, ylim=c(0,0.3))
lines(temp0$x, temp0$y, col="red")
#
temp0 <- density(c0, from=0, to=20)
hist(c0, prob=TRUE, ylim=c(0,0.3))
lines(temp0$x, temp0$y, col="green")
temp1 <- density(c1)
hist(c1, prob=TRUE, ylim=c(0,0.4))
lines(temp1$x, temp1$y, col="red")
temp1 <- density(c1, from=0, to=20)
hist(c1, prob=TRUE, ylim=c(0,0.4))
lines(temp1$x, temp1$y, col="green")
#
# (b) (ii)
# ____________________ Estimate the area under the density for c0
# The first line builds a function that returns interpolated y's for any set of x's
dc0 <- approxfun(temp0$x, temp0$y, yleft=0, yright=0)
# The area is approximated by a bunch of rectangles under the curve (a Riemann sum)
del <- 0.005
x <- seq(0, 50, del)
y <- dc0(x)
area <- sum(y)*del
#
# (b) (iii)
#
# Rescale so that the density integrates to one
ty <- y/area
dc0 <- approxfun(x, ty, yleft=0, yright=0)
temp0 <- density(c0, from=0, to=20)
par(mfrow=c(2,2))
# Draw the histogram and density again _________________
hist(c0, prob=TRUE, ylim=c(0,0.25))
lines(temp0$x, temp0$y, col="green")
# Add the rescaled empirical density ________________________
curve(dc0, 0, 50, n=101, add=TRUE, type="l", col="blue")
# Plot the CDF (cumulative sum of the density values times the step width)
tyc <- cumsum(ty)*del
pc0 <- approxfun(x, tyc, yleft=0, yright=1)
curve(pc0, 0, 20, n=101, type="l", col="blue")
# Plot the inverse of the CDF (duplicated CDF values are dropped so the inverse is well defined)
qc0 <- approxfun(tyc[!duplicated(tyc)], x[!duplicated(tyc)], yleft=-Inf, yright=+Inf)
curve(qc0, 0, 1, n=101, type="l", col="blue")
# Generate some random numbers from the CDF (the inverse-CDF method) and plot them
# to see if they mimic the data we started with
rc0 <- function(n) { qc0(runif(n)) }
x <- rc0(1000)
hist(x, prob=TRUE, ylim=c(0,0.3))
curve(dc0, 0, 50, n=101, add=TRUE, type="l", col="blue", ylim=c(0,0.3))
#
# _________________________ Assignment 3
#
# The given function: a mixture of normals __________________________________
dmnorm <- function(x, alpha, mean, sd)
{
  m <- length(alpha)
  pdf <- rep(0, length(x))
  for (i in 1:m) {
    pdf <- pdf + alpha[i]*dnorm(x, mean[i], sd[i])
  }
  return(pdf)
}
# (a) (i)
# Do the particular example
#
alpha <- c(.4, .3, .3)
mu <- c(10, 20, 30)
sig <- c(2, 3, 2)
x <- c(300:3600/100)
dens <- dmnorm(x, alpha, mu, sig)
plot(x, dens, type="l")
#
# (a) (ii)
# A random number generator for the mixture: draw a component index with
# probabilities alpha, then draw a normal with that component's mean and sd
#
rmnorm <- function(n, alpha, mean, sd)
{
  m <- length(alpha)
  ind <- sample(1:m, n, replace=TRUE, prob=alpha)
  rx <- rnorm(n, mean[ind], sd[ind])
  return(rx)
}
rx <- rmnorm(1000, alpha, mu, sig)
hist(rx, prob=TRUE, ylim=c(0,0.1), nclass=20)
x <- seq(0, 40, length=100)
pdf <- dmnorm(x, alpha, mu, sig)
lines(x, pdf, type="l", col="red")
#
# (b)
# Bandwidth experiments: normal-reference (hnorm), cross-validation (hcv),
# and Sheather-Jones (hsj) bandwidths
#
library(sm)
par(mfrow=c(2,2))
x <- seq(0, 40, length=100)
pdf <- dmnorm(x, alpha, mu, sig)
set.seed(1)
n <- 50
rx <- rmnorm(n, alpha, mu, sig)
hist(rx, prob=TRUE, ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hnorm(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hcv(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hsj(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
#
# (c)
# Repeat (b) with other sample sizes; a quick sketch follows.
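#
# A minimal sketch for (c), assuming the objects from (b) (rmnorm, alpha, mu,
# sig, and the true-density grid x and pdf) are still in the workspace: redo
# the normal-reference bandwidth fit for several sample sizes.
par(mfrow=c(2,2))
for (n in c(25, 50, 100, 250)) {
  rx <- rmnorm(n, alpha, mu, sig)
  sm.density(rx, hnorm(rx), ylim=c(0,0.1))
  lines(x, pdf, type="l", col="red")
  title(paste("n =", n))
}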
#
# (d)
# I couldn't find the tree data that Zucchini had in mind, so I am using a dataset already in R
# __________________________________ Trees
attach(trees)
par(mfrow=c(2,2))
hist(Height, prob=TRUE)
sm.density(Height, hnorm(Height), col="red")
sm.density(Height, hsj(Height), col="blue")
sm.density(Height, hcv(Height), col="green")
# _______________________________________________________ Assignment Four
#
strength <- matrix(scan("E:/Kernel/strength.txt"), ncol=4, byrow=TRUE)
colnames(strength) <- c("grip","arm","rating","sims")
n <- nrow(strength)
#
# (a) (i)
# See what some commands do
#
# (a) (ii)
# Put the columns of the data into appropriately named vectors ______________
pairs(strength)
grip <- strength[,1]
arm <- strength[,2]
rating <- strength[,3]
sims <- strength[,4]
#
# (a) (iii)
# Explore some more commands
#
cbind(grip, arm)  # creates a matrix whose columns are grip and arm
cbind(1, arm)     # creates a matrix whose first column contains ones and whose second is arm
rbind(grip, arm)  # creates a matrix whose rows are grip and arm
diag(1, 3)        # creates a 3x3 identity matrix
diag(c(1, 3))     # creates a 2x2 diagonal matrix with entries 1 and 3
#
# (b)
# Write an R function wls(y,x,w) that computes the weighted least squares regression coefficients.
#
# The function input:
#   y is a vector of length n containing the target variable
#   x is a matrix of explanatory variables (with n rows)
#   w is a vector of length n of weights
# The estimator is theta = (X'WX)^(-1) X'Wy, with a column of ones prepended for the intercept.
wls <- function(y, x, w)
{
  X <- cbind(1, x)
  W <- diag(w)
  theta <- solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% y
  return(theta)
}
# Set up the weights (all equal to one to start with)
w <- rep(1, n)
# Put the covariates grip and arm in a matrix
x <- cbind(grip, arm)
# Apply your function to estimate the regression coefficients
theta <- wls(rating, x, w)
theta
# Compare the result with the function lsfit
fitmod <- lsfit(x, rating, w)
coefs <- fitmod$coeff
coefs
resids <- fitmod$residuals
hist(resids)
qqnorm(resids)
qqline(resids)
# Exploring weighted least squares for a simple univariate case
# (c) (i) Estimate using equal weights
w <- rep(1, n)
theta <- wls(grip, arm, w)
# Plot the data and the fitted line
x0 <- c(0, 200)
y0 <- theta[1] + theta[2]*x0
plot(arm, grip, xlim=x0, ylim=y0)
lines(x0, y0)
# (c) (ii) Raise the weights of the observations with 100 < arm < 200 and re-estimate
# the parameters. (The up-weighting value below, 5, is an assumed choice.)
w[100 < arm & arm < 200] <- 5
theta <- wls(grip, arm, w)
plot(arm, grip, xlim=x0, ylim=y0)
lines(x0, theta[1] + theta[2]*x0)
#
# Smoothing splines with the cars data; this is the example from help(smooth.spline)
data(cars)
attach(cars)
plot(speed, dist, main="data(cars)  &  smoothing splines")
cars.spl <- smooth.spline(speed, dist)
lines(cars.spl, col="blue")
lines(smooth.spline(speed, dist, df=10), lty=2, col="red")
legend(5, 120, c(paste("default [C.V.] => df =", round(cars.spl$df, 1)),
                 "s( * , df = 10)"), col=c("blue","red"), lty=1:2, bg='bisque')
detach()
#
# 2(a)
#
# The function genmod(x,alpha,beta,sigma) generates realizations of y from the given model:
#   y = alpha*cos(2*pi*x/100) + beta*sin(2*pi*x/100) + e,  e ~ N(0, sigma^2)
#
genmod <- function(x, alpha, beta, sigma)
{
  foo <- 2*pi/100
  n <- length(x)
  y <- alpha*cos(foo*x) + beta*sin(foo*x) + rnorm(n, 0, sigma)
  return(y)
}
#
# 2. (b)
#
# Generate a sample for the given parameter values.
# Sorting the x-values makes it easier to plot later.
set.seed(1)
n <- 100
alpha <- 10
beta <- 5
sigma <- 1
x <- sort(runif(n, 0, 100))
y <- genmod(x, alpha, beta, sigma)
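#
# Not part of the assignment, but a quick parametric check: the model is linear
# in alpha and beta, so a no-intercept regression on the cosine and sine terms
# should recover values close to 10 and 5.
foo <- 2*pi/100
summary(lm(y ~ cos(foo*x) + sin(foo*x) - 1))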
#
# 2. (c)
#
# Compute the true conditional mean
yt <- genmod(x, alpha, beta, sigma=0)
# Four different smoothing splines conditional on degrees of freedom
par(mfrow=c(2,2))
# Default df, computed using cross-validation
yfit <- smooth.spline(x, y)
plot(x, y)  # plot the actual data
title(paste("Degrees of freedom=", round(yfit$df, 1)))
# Plot the true line in red, then the fitted line in blue
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 3 df
df <- 3
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 4 df
df <- 4
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 30 df
df <- 30
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
#
# 2(d)
#
# I'll just see how changing sigma affects the fits.
# You should experiment with the other parameters too (especially n).
set.seed(1)
sigma <- 10
x <- sort(runif(n, 0, 100))
y <- genmod(x, alpha, beta, sigma)
# Compute the true mean
yt <- genmod(x, alpha, beta, sigma=0)
# Four different smoothing splines
par(mfrow=c(2,2))
# Default df, computed using cross-validation
yfit <- smooth.spline(x, y)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 3 df
df <- 3
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 4 df
df <- 4
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 30 df
df <- 30
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
#
# 3.
#
# Read the corruption data (into dat, which the code below uses)
dat <- matrix(scan("E:/Kernel/corruption.txt"), ncol=6, byrow=TRUE)
cn <- scan("E:/kernel/country_abbrev.dat", what="character")
vnames <- c("CPI", "Investment / GDP", "Government deficit / GDP",
            "GDP / Population in 1970", "Reserve / GDP", "Fuels & minerals / GDP")
#
library(sm)
y <- dat[,1]
x <- dat[,4]
ind <- complete.cases(x, y)
ind
y <- y[ind]
x <- x[ind]
# Now that we have a set of data with the missing observations dropped:
#
par(mfrow=c(3,2))
# Use sm.regression to assess the relationship
# Two different bandwidths
#
yfitsm <- sm.regression(x, y, h=5.0, plot=TRUE)
yfitsm <- sm.regression(x, y, h=15.0, plot=TRUE)
#
# Now try smooth.spline
#
yfitsm <- smooth.spline(x, y, df=2)
plot(x, y, cex=0.5, main="Spline, df=2")
lines(yfitsm)
yfitsm <- smooth.spline(x, y, df=20)
plot(x, y, cex=0.5, main="Spline, df=20")
lines(yfitsm)
#
# Now try lowess, locally weighted polynomial regression
#
yfitl <- lowess(x, y)
plot(x, y, cex=0.5, main="Lowess, f=default")
lines(yfitl)
yfitl <- lowess(x, y, f=0.2)
plot(x, y, cex=0.5, main="Lowess, f=0.2")
lines(yfitl)
# f is the smoother span: the proportion of points in the plot which influence
# the smooth at each value. Larger values give more smoothness.
#
#
# ___________________ Assignment 7 ____________________________________
#
#
# 1. jitter adds a small amount of noise to a data vector.
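#
# A quick illustration of jitter (the input vector is made up for demonstration):
# each element gets a small uniform perturbation, which breaks ties.
jitter(rep(2, 5))  # e.g. 2.03 1.98 2.01 1.97 2.02; the exact values vary by run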
# gam fits a generalized additive model; it is in the mgcv package. An example follows.
#
library(mgcv)
set.seed(0)
n <- 400
sig <- 2
x0 <- runif(n, 0, 1)
x1 <- runif(n, 0, 1)
x2 <- runif(n, 0, 1)
x3 <- runif(n, 0, 1)
f0 <- function(x) 2 * sin(pi * x)
f1 <- function(x) exp(2 * x)
f2 <- function(x) 0.2*x^11*(10*(1-x))^6 + 10*(10*x)^3*(1-x)^10
f3 <- function(x) 0*x
# x3 is pure noise: it enters the gam formula below but not the true mean f
f <- f0(x0) + f1(x1) + f2(x2)
e <- rnorm(n, 0, sig)
y <- f + e
b <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3))
summary(b)
plot(b, pages=1, residuals=TRUE)
# The plot shows the four estimated smooth terms
#
# 2. The problem set calls for a data set that I do not have, so we'll make our own
# based on the first question.
#
set.seed(0)
n <- 100
sig <- 2
x0 <- runif(n, 0, 1)
x1 <- runif(n, 0, 1)
f0 <- function(x) 2 * sin(pi * x)
f1 <- function(x) 0.2*x^11*(10*(1-x))^6 + 10*(10*x)^3*(1-x)^10
f <- f0(x0) + f1(x1)
e <- rnorm(n, 0, sig)
y <- f + e
#
par(mfrow=c(3,2))
# Use estimated degrees of freedom (via cross-validation)
b <- gam(y ~ s(x0) + s(x1))
plot(b, pages=0)
# Experiment with the degrees of freedom: an unpenalized basis of dimension k=10 for x0
b <- gam(y ~ s(x0, k=10, fx=TRUE) + s(x1))
plot(b, pages=0)
# Experiment with the degrees of freedom: decrease to k=3 for x0
b <- gam(y ~ s(x0, k=3, fx=TRUE) + s(x1))
plot(b, pages=0)
# Look at the 3D picture
par(mfrow=c(1,1))
b <- gam(y ~ s(x0) + s(x1))
vis.gam(b, se=0)
# Question 3
# It is assumed that you have already loaded the data -- see the solution to Question 2.
# Jitter the covariates so that there are no ties in x0 or in x1.
# This simplifies the programming. The problem is that the function smooth.spline
# reduces the length of vectors with repeated values of the covariate,
# which makes things much trickier to code.
x0 <- jitter(x0)
x1 <- jitter(x1)
# Back-fitting algorithm (using approxfun)
# Give the degrees of freedom to be used for smoothing over x0 and x1;
# a sketch of the back-fitting loop is in the appendix at the end of this file.
# Question 4
library(sm)
data(trees)
attach(trees)
par(mfrow=c(2,1))
# Use the cross-validation estimates
b <- gam(Volume ~ s(Girth) + s(Height))
plot(b, pages=0)
# Look at the 3D picture
par(mfrow=c(1,1))
b <- gam(Volume ~ s(Girth) + s(Height))
vis.gam(b, plot.type="persp", se=2, theta=-35)
#
# The End
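#
# ____________ Appendix (referenced from Question 3): a back-fitting sketch
#
# This is a minimal sketch, not the original solution. It assumes y, x0 and x1
# from Question 2 (after jittering) are in the workspace, and the degrees of
# freedom below are arbitrary choices. Each smooth is refit to the partial
# residuals of the other; approxfun turns the fitted spline into a function
# that can be evaluated at the observed covariate values.
df0 <- 5   # assumed df for the smooth over x0
df1 <- 5   # assumed df for the smooth over x1
f0hat <- rep(0, length(y))  # current estimate of the smooth in x0
f1hat <- rep(0, length(y))  # current estimate of the smooth in x1
for (it in 1:20) {
  # Smooth the partial residuals y - mean(y) - f1hat over x0, then center
  s0 <- smooth.spline(x0, y - mean(y) - f1hat, df=df0)
  f0hat <- approxfun(s0$x, s0$y)(x0)
  f0hat <- f0hat - mean(f0hat)
  # Smooth the partial residuals y - mean(y) - f0hat over x1, then center
  s1 <- smooth.spline(x1, y - mean(y) - f0hat, df=df1)
  f1hat <- approxfun(s1$x, s1$y)(x1)
  f1hat <- f1hat - mean(f1hat)
}
# Plot the two estimated components against their covariates
par(mfrow=c(1,2))
plot(x0, f0hat, cex=0.5)
plot(x1, f1hat, cex=0.5)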