#
# Load the kernel smoothing and quantile regression libraries
library(KernSmooth)
library(quantreg)
#
# Read the weekly gas price data - 8/20/90 to 4/02/07
price <- read.table(file="C:/Documents and Settings/Andrew Buck/My Documents/Econ 616/gasprices.csv",
                    header=TRUE, sep=",", dec=".", na.strings="NA", check.names=TRUE)
# Put the price data in lower-case x
x <- price[,2]
# Length of the series
n <- length(x)
# Set up the distributed lags in upper-case X
p <- 5
X <- cbind(x[(p-1):(n-1)], x[(p-2):(n-2)], x[(p-3):(n-3)], x[(p-4):(n-4)])
# Put the current price in y
y <- x[p:n]
#
# Assignment 1. ____________________________________________________________________________________
#
# Instead of using the data in the assignment I am using the gas price data. The homework
# exercises will be done using the residuals from an AR process.
# Plot this week's price against last week's price
plot(X[,1], y, col="blue", cex=0.25, xlab="Last Week's Price", ylab="This Week's Price")
# Add some quantile regression lines
abline(rq(y ~ X[,1], tau=0.1), col="green")
abline(rq(y ~ X[,1], tau=0.5), col="green")
abline(rq(y ~ X[,1], tau=0.9), col="green")
qreg <- rq(y ~ X[,1] + X[,2] + X[,3] + X[,4], tau=0.5)
r1 <- resid(qreg)
plot(r1)
#
# (a) (ii)
set.seed(321)
samp100 <- sample(r1, 100)
samp015 <- samp100[1:15]
samp020 <- samp100[1:20]
samp050 <- samp100[1:50]
#
# (b) (i)
# Compare a histogram with a kernel density estimate
par(mfrow=c(2,1))
hist(samp015)
plot(density(samp015))
#
# (b) (ii)
# Compare the effect of sample size
par(mfrow=c(2,2))
plot(density(samp015))
plot(density(samp020))
plot(density(samp050))
plot(density(samp100))
#
# (c) (i)
# ___________ bin width
# The breaks are extended one step past rmax so they are sure to span the data
par(mfrow=c(2,2))
rmin <- min(r1)
rmax <- max(r1)
hist(samp050, br=seq(rmin, rmax+10, 10))
hist(samp050, br=seq(rmin, rmax+5, 5))
hist(samp050, br=seq(rmin, rmax+2, 2))
hist(samp050, br=seq(rmin, rmax+1, 1))
#
# (c) (i)
# ____________ bandwidth
plot(density(samp100, bw=0.1))
plot(density(samp100, bw=0.2))
plot(density(samp100, bw=0.5))
plot(density(samp100, bw=1.0))
#
# (c) (ii)
# ____________ kernel
par(mfrow=c(3,2))
plot(density(samp100, kernel="gaussian"))
plot(density(samp100, kernel="epanechnikov"))
plot(density(samp100, kernel="triangular"))
plot(density(samp100, kernel="biweight"))
plot(density(samp100, kernel="cosine"))
plot(density(samp100, kernel="optcosine"))
#
# (c) (iii)
# ______________ number of evaluation points (the n argument to density)
par(mfrow=c(2,2))
plot(density(samp100, n=4))
plot(density(samp100, n=8))
plot(density(samp100, n=16))
plot(density(samp100, n=1024))
#
# Assignment 2. ______________________________________________________________
#
# (a) Familiarize yourself with some functions. You should have used similar functions in other software.
# The only new one should be approxfun, which does a linear approximation on a set of (x,y) pairs.
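#
# A minimal illustration of approxfun (the points and query values below are
# made up for demonstration): it returns a *function* that linearly
# interpolates between the supplied (x,y) pairs.
f <- approxfun(c(0, 1, 2), c(0, 10, 0))
f(c(0.5, 1.5))   # both are 5: halfway up the first segment, halfway down the second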
#
credit <- read.csv("C:/Documents and Settings/Andrew Buck/My Documents/Econ 616/Kredit.csv", header=TRUE)
# _____________________ Separate the good credit risks from the bad
c0 <- credit[credit[,1]==0, 2]/1000
c1 <- credit[credit[,1]==1, 2]/1000
#
# (b) (i)
# _____________________ Plot a histogram and the density, then redo the density
# with restricted support. Just for fun I did this for both groups.
#
par(mfrow=c(2,2))
temp0 <- density(c0)
hist(c0, prob=TRUE, ylim=c(0,0.3))
lines(temp0$x, temp0$y, col="red")
#
temp0 <- density(c0, from=0, to=20)
hist(c0, prob=TRUE, ylim=c(0,0.3))
lines(temp0$x, temp0$y, col="green")
temp1 <- density(c1)
hist(c1, prob=TRUE, ylim=c(0,0.4))
lines(temp1$x, temp1$y, col="red")
temp1 <- density(c1, from=0, to=20)
hist(c1, prob=TRUE, ylim=c(0,0.4))
lines(temp1$x, temp1$y, col="green")
#
# (b) (ii)
# ____________________ Estimate the area under the density for c0
# The first line builds a function that returns interpolated y's for any set of x's
dc0 <- approxfun(temp0$x, temp0$y, yleft=0, yright=0)
# The area is approximated by a bunch of rectangles under the curve (a Riemann sum)
del <- 0.005
x <- seq(0, 50, del)
y <- dc0(x)
area <- sum(y)*del
#
# (b) (iii)
#
# Rescale so that the density integrates to one
ty <- y/area
dc0 <- approxfun(x, ty, yleft=0, yright=0)
temp0 <- density(c0, from=0, to=20)
par(mfrow=c(2,2))
# Draw the histogram and density again _________________
hist(c0, prob=TRUE, ylim=c(0,0.25))
lines(temp0$x, temp0$y, col="green")
# Add the rescaled empirical density ________________________
curve(dc0, 0, 50, n=101, add=TRUE, type="l", col="blue")
# Plot the CDF (cumulative sum of the density values times the step width)
tyc <- cumsum(ty)*del
pc0 <- approxfun(x, tyc, yleft=0, yright=1)
curve(pc0, 0, 20, n=101, type="l", col="blue")
# Plot the inverse of the CDF (duplicated CDF values are dropped so the inverse is well defined)
qc0 <- approxfun(tyc[!duplicated(tyc)], x[!duplicated(tyc)], yleft=-Inf, yright=+Inf)
curve(qc0, 0, 1, n=101, type="l", col="blue")
# Generate some random numbers from the CDF (the inverse-CDF method) and plot them
# to see if they mimic the data we started with
rc0 <- function(n) { qc0(runif(n)) }
x <- rc0(1000)
hist(x, prob=TRUE, ylim=c(0,0.3))
curve(dc0, 0, 50, n=101, add=TRUE, type="l", col="blue", ylim=c(0,0.3))
#
# _________________________ Assignment 3
#
# The given function: a mixture of normals __________________________________
dmnorm <- function(x, alpha, mean, sd)
{
  m <- length(alpha)
  pdf <- rep(0, length(x))
  for (i in 1:m) {
    pdf <- pdf + alpha[i]*dnorm(x, mean[i], sd[i])
  }
  return(pdf)
}
# (a) (i)
# Do the particular example
#
alpha <- c(.4, .3, .3)
mu <- c(10, 20, 30)
sig <- c(2, 3, 2)
x <- c(300:3600/100)
dens <- dmnorm(x, alpha, mu, sig)
plot(x, dens, type="l")
#
# (a) (ii)
# A random number generator for the mixture: draw a component index with
# probabilities alpha, then draw a normal with that component's mean and sd
#
rmnorm <- function(n, alpha, mean, sd)
{
  m <- length(alpha)
  ind <- sample(1:m, n, replace=TRUE, prob=alpha)
  rx <- rnorm(n, mean[ind], sd[ind])
  return(rx)
}
rx <- rmnorm(1000, alpha, mu, sig)
hist(rx, prob=TRUE, ylim=c(0,0.1), nclass=20)
x <- seq(0, 40, length=100)
pdf <- dmnorm(x, alpha, mu, sig)
lines(x, pdf, type="l", col="red")
#
# (b)
# Bandwidth experiments: normal-reference (hnorm), cross-validation (hcv),
# and Sheather-Jones (hsj) bandwidths
#
library(sm)
par(mfrow=c(2,2))
x <- seq(0, 40, length=100)
pdf <- dmnorm(x, alpha, mu, sig)
set.seed(1)
n <- 50
rx <- rmnorm(n, alpha, mu, sig)
hist(rx, prob=TRUE, ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hnorm(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hcv(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
sm.density(rx, hsj(rx), ylim=c(0,0.1))
lines(x, pdf, type="l", col="red")
#
# (c)
# Repeat (b) with other sample sizes; a quick sketch follows.
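#
# A minimal sketch for (c), assuming the objects from (b) (rmnorm, alpha, mu,
# sig, and the true-density grid x and pdf) are still in the workspace: redo
# the normal-reference bandwidth fit for several sample sizes.
par(mfrow=c(2,2))
for (n in c(25, 50, 100, 250)) {
  rx <- rmnorm(n, alpha, mu, sig)
  sm.density(rx, hnorm(rx), ylim=c(0,0.1))
  lines(x, pdf, type="l", col="red")
  title(paste("n =", n))
}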
#
# (d)
# I couldn't find the tree data that Zucchini had in mind, so I am using a dataset already in R
# __________________________________ Trees
attach(trees)
par(mfrow=c(2,2))
hist(Height, prob=TRUE)
sm.density(Height, hnorm(Height), col="red")
sm.density(Height, hsj(Height), col="blue")
sm.density(Height, hcv(Height), col="green")
# _______________________________________________________ Assignment Four
#
strength <- matrix(scan("E:/Kernel/strength.txt"), ncol=4, byrow=TRUE)
colnames(strength) <- c("grip","arm","rating","sims")
n <- nrow(strength)
#
# (a) (i)
# See what some commands do
#
# (a) (ii)
# Put the columns of the data into appropriately named vectors ______________
pairs(strength)
grip <- strength[,1]
arm <- strength[,2]
rating <- strength[,3]
sims <- strength[,4]
#
# (a) (iii)
# Explore some more commands
#
cbind(grip, arm)  # creates a matrix whose columns are grip and arm
cbind(1, arm)     # creates a matrix whose first column contains ones and whose second is arm
rbind(grip, arm)  # creates a matrix whose rows are grip and arm
diag(1, 3)        # creates a 3x3 identity matrix
diag(c(1, 3))     # creates a 2x2 diagonal matrix with entries 1 and 3
#
# (b)
# Write an R function wls(y,x,w) that computes the weighted least squares regression coefficients.
#
# The function input:
#   y is a vector of length n containing the target variable
#   x is a matrix of explanatory variables (with n rows)
#   w is a vector of length n of weights
# The estimator is theta = (X'WX)^(-1) X'Wy, with a column of ones prepended for the intercept.
wls <- function(y, x, w)
{
  X <- cbind(1, x)
  W <- diag(w)
  theta <- solve(t(X) %*% W %*% X) %*% t(X) %*% W %*% y
  return(theta)
}
# Set up the weights (all equal to one to start with)
w <- rep(1, n)
# Put the covariates grip and arm in a matrix
x <- cbind(grip, arm)
# Apply your function to estimate the regression coefficients
theta <- wls(rating, x, w)
theta
# Compare the result with the function lsfit
fitmod <- lsfit(x, rating, w)
coefs <- fitmod$coeff
coefs
resids <- fitmod$residuals
hist(resids)
qqnorm(resids)
qqline(resids)
# Exploring weighted least squares for a simple univariate case
# (c) (i) Estimate using equal weights
w <- rep(1, n)
theta <- wls(grip, arm, w)
# Plot the data and the fitted line
x0 <- c(0, 200)
y0 <- theta[1] + theta[2]*x0
plot(arm, grip, xlim=x0, ylim=y0)
lines(x0, y0)
# (c) (ii) Raise the weights of the observations with 100 < arm < 200 and re-estimate
# the parameters. (The up-weighting value below, 5, is an assumed choice.)
w[100 < arm & arm < 200] <- 5
theta <- wls(grip, arm, w)
plot(arm, grip, xlim=x0, ylim=y0)
lines(x0, theta[1] + theta[2]*x0)
#
# Smoothing splines with the cars data; this is the example from help(smooth.spline)
data(cars)
attach(cars)
plot(speed, dist, main="data(cars)  &  smoothing splines")
cars.spl <- smooth.spline(speed, dist)
lines(cars.spl, col="blue")
lines(smooth.spline(speed, dist, df=10), lty=2, col="red")
legend(5, 120, c(paste("default [C.V.] => df =", round(cars.spl$df, 1)),
                 "s( * , df = 10)"), col=c("blue","red"), lty=1:2, bg='bisque')
detach()
#
# 2(a)
#
# The function genmod(x,alpha,beta,sigma) generates realizations of y from the given model:
#   y = alpha*cos(2*pi*x/100) + beta*sin(2*pi*x/100) + e,  e ~ N(0, sigma^2)
#
genmod <- function(x, alpha, beta, sigma)
{
  foo <- 2*pi/100
  n <- length(x)
  y <- alpha*cos(foo*x) + beta*sin(foo*x) + rnorm(n, 0, sigma)
  return(y)
}
#
# 2. (b)
#
# Generate a sample for the given parameter values.
# Sorting the x-values makes it easier to plot later.
set.seed(1)
n <- 100
alpha <- 10
beta <- 5
sigma <- 1
x <- sort(runif(n, 0, 100))
y <- genmod(x, alpha, beta, sigma)
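#
# Not part of the assignment, but a quick parametric check: the model is linear
# in alpha and beta, so a no-intercept regression on the cosine and sine terms
# should recover values close to 10 and 5.
foo <- 2*pi/100
summary(lm(y ~ cos(foo*x) + sin(foo*x) - 1))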
#
# 2. (c)
#
# Compute the true conditional mean
yt <- genmod(x, alpha, beta, sigma=0)
# Four different smoothing splines conditional on degrees of freedom
par(mfrow=c(2,2))
# Default df, computed using cross-validation
yfit <- smooth.spline(x, y)
plot(x, y)  # plot the actual data
title(paste("Degrees of freedom=", round(yfit$df, 1)))
# Plot the true line in red, then the fitted line in blue
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 3 df
df <- 3
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 4 df
df <- 4
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 30 df
df <- 30
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
#
# 2(d)
#
# I'll just see how changing sigma affects the fits.
# You should experiment with the other parameters too (especially n).
set.seed(1)
sigma <- 10
x <- sort(runif(n, 0, 100))
y <- genmod(x, alpha, beta, sigma)
# Compute the true mean
yt <- genmod(x, alpha, beta, sigma=0)
# Four different smoothing splines
par(mfrow=c(2,2))
# Default df, computed using cross-validation
yfit <- smooth.spline(x, y)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 3 df
df <- 3
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 4 df
df <- 4
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
# 30 df
df <- 30
yfit <- smooth.spline(x, y, df=df)
plot(x, y)
title(paste("Degrees of freedom=", round(yfit$df, 1)))
lines(x, yt, col="red", lwd=3)
lines(yfit, col="blue", lwd=3)
#
# 3.
#
# Read the corruption data (into dat, which the code below uses)
dat <- matrix(scan("E:/Kernel/corruption.txt"), ncol=6, byrow=TRUE)
cn <- scan("E:/kernel/country_abbrev.dat", what="character")
vnames <- c("CPI", "Investment / GDP", "Government deficit / GDP",
            "GDP / Population in 1970", "Reserve / GDP", "Fuels & minerals / GDP")
#
library(sm)
y <- dat[,1]
x <- dat[,4]
ind <- complete.cases(x, y)
ind
y <- y[ind]
x <- x[ind]
# Now that we have a set of data with the missing observations dropped:
#
par(mfrow=c(3,2))
# Use sm.regression to assess the relationship
# Two different bandwidths
#
yfitsm <- sm.regression(x, y, h=5.0, plot=TRUE)
yfitsm <- sm.regression(x, y, h=15.0, plot=TRUE)
#
# Now try smooth.spline
#
yfitsm <- smooth.spline(x, y, df=2)
plot(x, y, cex=0.5, main="Spline, df=2")
lines(yfitsm)
yfitsm <- smooth.spline(x, y, df=20)
plot(x, y, cex=0.5, main="Spline, df=20")
lines(yfitsm)
#
# Now try lowess, locally weighted polynomial regression
#
yfitl <- lowess(x, y)
plot(x, y, cex=0.5, main="Lowess, f=default")
lines(yfitl)
yfitl <- lowess(x, y, f=0.2)
plot(x, y, cex=0.5, main="Lowess, f=0.2")
lines(yfitl)
# f is the smoother span: the proportion of points in the plot which influence
# the smooth at each value. Larger values give more smoothness.
#
#
# ___________________ Assignment 7 ____________________________________
#
#
# 1. jitter adds a small amount of noise to a data vector.
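#
# A quick illustration of jitter (the input vector is made up for demonstration):
# each element gets a small uniform perturbation, which breaks ties.
jitter(rep(2, 5))  # e.g. 2.03 1.98 2.01 1.97 2.02; the exact values vary by run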
# gam fits a generalized additive model; it is in the mgcv package. An example follows.
#
library(mgcv)
set.seed(0)
n <- 400
sig <- 2
x0 <- runif(n, 0, 1)
x1 <- runif(n, 0, 1)
x2 <- runif(n, 0, 1)
x3 <- runif(n, 0, 1)
f0 <- function(x) 2 * sin(pi * x)
f1 <- function(x) exp(2 * x)
f2 <- function(x) 0.2*x^11*(10*(1-x))^6 + 10*(10*x)^3*(1-x)^10
f3 <- function(x) 0*x
# x3 is pure noise: it enters the gam formula below but not the true mean f
f <- f0(x0) + f1(x1) + f2(x2)
e <- rnorm(n, 0, sig)
y <- f + e
b <- gam(y ~ s(x0) + s(x1) + s(x2) + s(x3))
summary(b)
plot(b, pages=1, residuals=TRUE)
# The plot shows the four estimated smooth terms
#
# 2. The problem set calls for a data set that I do not have, so we'll make our own
# based on the first question.
#
set.seed(0)
n <- 100
sig <- 2
x0 <- runif(n, 0, 1)
x1 <- runif(n, 0, 1)
f0 <- function(x) 2 * sin(pi * x)
f1 <- function(x) 0.2*x^11*(10*(1-x))^6 + 10*(10*x)^3*(1-x)^10
f <- f0(x0) + f1(x1)
e <- rnorm(n, 0, sig)
y <- f + e
#
par(mfrow=c(3,2))
# Use estimated degrees of freedom (via cross-validation)
b <- gam(y ~ s(x0) + s(x1))
plot(b, pages=0)
# Experiment with the degrees of freedom: an unpenalized basis of dimension k=10 for x0
b <- gam(y ~ s(x0, k=10, fx=TRUE) + s(x1))
plot(b, pages=0)
# Experiment with the degrees of freedom: decrease to k=3 for x0
b <- gam(y ~ s(x0, k=3, fx=TRUE) + s(x1))
plot(b, pages=0)
# Look at the 3D picture
par(mfrow=c(1,1))
b <- gam(y ~ s(x0) + s(x1))
vis.gam(b, se=0)
# Question 3
# It is assumed that you have already loaded the data -- see the solution to Question 2.
# Jitter the covariates so that there are no ties in x0 or in x1.
# This simplifies the programming. The problem is that the function smooth.spline
# reduces the length of vectors with repeated values of the covariate,
# which makes things much trickier to code.
x0 <- jitter(x0)
x1 <- jitter(x1)
# Back-fitting algorithm (using approxfun)
# Give the degrees of freedom to be used for smoothing over x0 and x1;
# a sketch of the back-fitting loop is in the appendix at the end of this file.
# Question 4
library(sm)
data(trees)
attach(trees)
par(mfrow=c(2,1))
# Use the cross-validation estimates
b <- gam(Volume ~ s(Girth) + s(Height))
plot(b, pages=0)
# Look at the 3D picture
par(mfrow=c(1,1))
b <- gam(Volume ~ s(Girth) + s(Height))
vis.gam(b, plot.type="persp", se=2, theta=-35)
#
# The End
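#
# ____________ Appendix (referenced from Question 3): a back-fitting sketch
#
# This is a minimal sketch, not the original solution. It assumes y, x0 and x1
# from Question 2 (after jittering) are in the workspace, and the degrees of
# freedom below are arbitrary choices. Each smooth is refit to the partial
# residuals of the other; approxfun turns the fitted spline into a function
# that can be evaluated at the observed covariate values.
df0 <- 5   # assumed df for the smooth over x0
df1 <- 5   # assumed df for the smooth over x1
f0hat <- rep(0, length(y))  # current estimate of the smooth in x0
f1hat <- rep(0, length(y))  # current estimate of the smooth in x1
for (it in 1:20) {
  # Smooth the partial residuals y - mean(y) - f1hat over x0, then center
  s0 <- smooth.spline(x0, y - mean(y) - f1hat, df=df0)
  f0hat <- approxfun(s0$x, s0$y)(x0)
  f0hat <- f0hat - mean(f0hat)
  # Smooth the partial residuals y - mean(y) - f0hat over x1, then center
  s1 <- smooth.spline(x1, y - mean(y) - f0hat, df=df1)
  f1hat <- approxfun(s1$x, s1$y)(x1)
  f1hat <- f1hat - mean(f1hat)
}
# Plot the two estimated components against their covariates
par(mfrow=c(1,2))
plot(x0, f0hat, cex=0.5)
plot(x1, f1hat, cex=0.5)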