# CAPM linear regression ----
# Read the monthly price data for Facebook and the S&P 500 index.
fb <- read.csv('FB.csv', header = TRUE)
sp_500 <- read.csv('SP500.csv', header = TRUE)

# Join the closing prices of the two datasets column-wise.
# NOTE(review): cbind() does not align rows by date -- this assumes both
# files cover the same periods in the same order; verify against the CSVs.
monthly_prices <- cbind(fb['Close'], sp_500['Close'])
colnames(monthly_prices) <- c('FB', 'SP500')

# Inspect the first rows of the combined data frame.
head(monthly_prices)
# Monthly returns ----
# Simple returns: r_t = (P_t - P_{t-1}) / P_{t-1}.
# Denote by n the number of time periods:
n <- nrow(monthly_prices)
monthly_returns <- (monthly_prices[2:n, ] - monthly_prices[1:(n-1), ]) /
  monthly_prices[1:(n-1), ]
# Note: this already corresponds to clean_monthly_returns in the Python code.

# Split dependent (FB) and independent (SP500) variables.
X <- as.matrix(monthly_returns['SP500'])
y <- as.matrix(monthly_returns['FB'])

# The lm function automatically adds the intercept term.
fit <- lm(y ~ X)
summary(fit)  # for interpretation see below
# Interpretation:
# FB_returns = 0.020270 + 0.575091 * SP500_returns

# Visualize the fitted regression line over the scatter of returns.
coeff <- coefficients(fit)
plot(X, y, xlab = 'SP500 monthly returns', ylab = 'Facebook monthly returns')
abline(a = coeff[1], b = coeff[2], col = 2, lwd = 2)
# Digression on linear models ----
# This section explains the output of the summary(fit) command by
# recomputing all of its quantities manually and interpreting them.
# For more details see the Computational Statistics course.

# Basic ingredients: design matrix with an intercept column, response vector.
n <- nrow(monthly_returns)
X <- as.matrix(cbind(1, monthly_returns['SP500']))
y <- as.matrix(monthly_returns['FB'])
XtX.inv <- solve(t(X) %*% X)  # (X^T X)^{-1}

# Compute the OLS estimates by hand: hatbeta = (X^T X)^{-1} X^T y.
# If X has full rank, it is very easy (good exercise) to show that
# hatbeta is the minimizer of ||y - X b||^2 over vectors b in R^p.
# Compare to the coefficients reported by summary(fit).
(hatbeta <- XtX.inv %*% t(X) %*% y)

# Compute fitted values by hand: yhat = X hatbeta.
y.hat <- X %*% hatbeta
# Compare to fitted.values(fit) -- differences are rounding only:
max( abs(y.hat - fitted.values(fit)) )

# Re-compute the residuals (small difference due to rounding):
res <- y - y.hat
# Compare to the residuals stored in the fit object:
max(abs(res - residuals(fit)))

# Re-compute summary statistics of the residuals; note some left skewness.
# Compare to "Residuals" in summary(fit).
summary(res, digits = 3)
# Residual standard error (RSE) ----
p <- 2  # number of coefficients: intercept and one slope
sum(res^2) / (n - p)                 # sigmahat^2 = RSS/(n-p)
(RSE <- sqrt( sum(res^2) / (n - p) ))  # RSE = sqrt(RSS/(n-p))
# Compare to "Residual standard error" in summary(fit).
# sigmahat^2 is a measure of goodness of fit: it estimates sigma^2, the
# variance of the statistical errors. The smaller the number, the better
# the fit (points closer to the line). The RSE is measured in the same
# units as the dependent variable.

# Residual plot with +/- 1 and +/- 2 RSE bands.
# NOTE(review): ylim = c(-6, 6) looks wide for monthly-return residuals;
# adjust to the data if the bands are not visible.
plot(residuals(fit), ylim = c(-6, 6), main = "Residuals")
# In case of normally distributed errors, we expect:
#   about 68% of the points within +/- hat.sigma
#   from the regression plane (blue dotted lines),
#   about 95% of the points within +/- 2*hat.sigma
#   from the regression plane (orange dotted lines).
abline(h = 0, lty = 2)
abline(h = RSE, lty = 3, col = "blue", lwd = 2)
abline(h = -RSE, lty = 3, col = "blue", lwd = 2)
abline(h = 2 * RSE, lty = 3, col = "orange", lwd = 2)
abline(h = -2 * RSE, lty = 3, col = "orange", lwd = 2)

# R^2 and adjusted R^2 ----
RSS <- sum( (y - y.hat)^2 )
TSS <- sum( (y - mean(y))^2 )
(Rsquared <- 1 - RSS/TSS)
# Interpretation: proportion of variance explained by the regression model.
# Compare to "Multiple R-squared" in summary(fit).
(Rsquared.adj <- 1 - (RSS/(n-p)) / (TSS/(n-1)))
# Adjusted for the number of variables in the model.
# Compare to "Adjusted R-squared" in summary(fit).

# Diagnostic plots: Tukey-Anscombe plot and QQ plot of the residuals.
plot(fit, which = c(1, 2))
# Individual p-values ----
# Re-compute the standard error for the SP500 coefficient:
se.SP500 <- RSE * sqrt(XtX.inv[2, 2])
se.SP500

# Reproduce the t-value for SP500:
tval.SP500 <- hatbeta[2] / se.SP500
tval.SP500

# Two-sided p-value for SP500 under H0: beta_SP500 = 0.
# p-value = P(observing a test statistic at least as extreme as the
# observed t-value under H0). Spell out lower.tail to avoid partial
# argument matching.
2 * pt(abs(tval.SP500), df = n - p, lower.tail = FALSE)

# We can also reproduce the p-value for SP500 by comparing two models
# and conducting a partial F-test:
fit.intercept_only <- lm(y ~ 1)
anova(fit.intercept_only, fit)

################################################################
# Overall F-test ----
# The p-value at the bottom right of summary(fit) is from the overall
# F-test, which compares the full model against the empty
# (intercept-only) model:
anova(fit.intercept_only, fit)
# Good to look at this first, before looking at individual p-values --
# this avoids multiple testing problems.
# Note that here the p-value of SP500 and the p-value from the overall
# F-test coincide, because the model has only one predictor variable.
# In general, the individual p-value for beta_k compares
#   H0: the model with beta_k = 0
#   H1: the same model without the restriction on beta_k