Chapter 4. Least Squares Regression
chapter-4.Rmd
Load the data and create subsamples
We process the data with the following code:
library(hansen)
#>
#> Attaching package: 'hansen'
#> The following object is masked from 'package:stats':
#>
#> influence
experience <- cps09mar$age - cps09mar$education_num - 6
mbf <- (cps09mar$race_num == 2) & (cps09mar$marital_num <= 2) &
(cps09mar$female_num == 1) & (experience == 12)
sam <- (cps09mar$race_num == 4) & (cps09mar$marital_num == 7) &
(cps09mar$female_num == 0)
dat1 <- cps09mar[mbf, ]
dat2 <- cps09mar[sam, ]
dim(dat1)
#> [1] 20 18
dim(dat2)
#> [1] 268 18
Table 4.1
Using the OLS functions from Chapter 3, we can compute the standard errors with the following C++ code:
#include "00_main.h"
double> ols2_(const doubles_matrix<>& y, const doubles_matrix<>& x) {
Mat<double> Y = as_Mat(y);
Mat<double> X = as_Mat(x);
Mat<
double> beta = ols_(Y, X);
Mat<double> e = Y - (X * beta);
Mat<double> XXi = inv(X.t() * X);
Mat<double> leverage = sum(X % (X * XXi), 1);
Mat<
double n = Y.n_rows; // using int then returns the wrong "a" value
double k = X.n_cols;
double sig2 = as_scalar((e.t() * e) / (n - k));
double> ones = Mat<double>(1, k, fill::ones);
Mat<double> xx = inv(X.t() * X);
Mat<
// Heteroskedastic formula
double> v0 = xx * sig2;
Mat<double> s0 = sqrt(v0.diag());
Mat<
// White formula
double> u1 = X % (e * ones);
Mat<double> v1 = xx * (u1.t() * u1) * xx;
Mat<double> s1 = sqrt(v1.diag());
Mat<
// HC1 formula
double a = n / (n - k);
double> v1a = a * (xx * (u1.t() * u1) * xx);
Mat<double> s1a = sqrt(v1a.diag());
Mat<
// HC2 formula
double> u2 = X % ((e / sqrt(1 - leverage)) * ones);
Mat<double> v2 = xx * (u2.t() * u2) * xx;
Mat<double> s2 = sqrt(v2.diag());
Mat<
// HC3 formula
double> u3 = X % ((e / (1 - leverage)) * ones);
Mat<double> v3 = xx * (u3.t() * u3) * xx;
Mat<double> s3 = sqrt(v3.diag());
Mat<
// Bind the results
double> result = join_rows(s0, s1, s1a, s2);
Mat<// join_rows admits max 4 arguments
result = join_rows(result, s3); return result.t();
}
cpp11::register]] doubles_matrix<> ols2_r_(const doubles_matrix<>& y,
[[const doubles_matrix<>& x) {
return as_doubles_matrix(ols2(y, x));
}
Now the function can be used in R:
y <- as.matrix(log(dat1$earnings / (dat1$hours * dat1$week)))
x <- cbind(1, dat1$education_num)
table41 <- ols2(y, x)
rownames(table41) <- c("Homoskedastic (4.32)", "HC0 (4.33)", "HC1 (4.34)",
"HC2 (4.35)", "HC3 (4.36)")
colnames(table41) <- c("Intercept", "Education")
round(table41, 3)
#> Intercept Education
#> Homoskedastic (4.32) 0.707 0.045
#> HC0 (4.33) 0.461 0.029
#> HC1 (4.34) 0.486 0.030
#> HC2 (4.35) 0.493 0.031
#> HC3 (4.36) 0.527 0.033
Table 4.2
Similar to the first section, we need to create subtables:
edu12 <- (cps09mar$education_num > 11)
dat3 <- cps09mar[edu12, ]
marriedF <- (dat3$marital_num <= 3) & (dat3$female_num == 1)
marriedM <- (dat3$marital_num <= 3) & (dat3$female_num == 0)
unionF <- (dat3$union == 1) & (dat3$female_num == 1)
unionM <- (dat3$union == 1) & (dat3$female_num == 0)
fmarriedF <- (dat3$marital_num <= 6) & (dat3$marital_num > 3) &
(dat3$female_num == 1)
fmarriedM <- (dat3$marital_num <= 6) & (dat3$marital_num > 3) &
(dat3$female_num == 0)
black <- (dat3$race_num == 2)
american_indian <- (dat3$race_num == 3)
asian <- (dat3$race_num == 4)
mixed <- (dat3$race_num >= 6)
The C++ function for Table 4.2 is similar to the previous one:
double> cls_(const doubles_matrix<>& y, const doubles_matrix<>& x) {
Mat<double> Y = as_Mat(y);
Mat<double> X = as_Mat(x);
Mat<
double> beta = ols_(Y, X);
Mat<double> e = Y - (X * beta);
Mat<double> XXi = inv(X.t() * X);
Mat<double> leverage = sum(X % (X * XXi), 1);
Mat<
double n = Y.n_rows; // using int then returns the wrong "a" value
double k = X.n_cols;
double> ones = Mat<double>(1, k, fill::ones);
Mat<double> xx = inv(X.t() * X);
Mat<
// HC2 formula
double> u2 = X % ((e / sqrt(1 - leverage)) * ones);
Mat<double> v2 = xx * (u2.t() * u2) * xx;
Mat<double> s2 = sqrt(v2.diag());
Mat<
// Bind the results
double> result = join_rows(beta, s2);
Mat<return result;
}
cpp11::register]] doubles_matrix<> cls_r_(const doubles_matrix<>& y,
[[const doubles_matrix<>& x) {
return as_doubles_matrix(cls_(y, x));
}
Now we can replicate the results in R:
y <- as.matrix(log(dat3$earnings / (dat3$hours * dat3$week)))
experience <- dat3$age - dat3$education_num - 6
exp2 <- (experience^2) / 100
x <- cbind(
1, dat3$education_num, experience, exp2, dat3$female_num,
unionF, unionM, marriedF, marriedM, fmarriedF, fmarriedM,
dat3$hisp_num, black, american_indian, asian, mixed
)
table42 <- cls(y, x)
rownames(table42) <- c(
"Intercept", "Education", "Experience",
"Experience^2 / 100", "Female", "Female union member", "Male union member",
"Married female", "Married male", "Formerly married female",
"Formerly married male", "Hispanic", "Black", "American Indian", "Asian",
"Mixed race"
)
colnames(table42) <- c("Estimate", "Std. Error")
round(table42, 3)
#> Estimate Std. Error
#> Intercept 0.909 0.021
#> Education 0.117 0.001
#> Experience 0.033 0.001
#> Experience^2 / 100 -0.056 0.002
#> Female -0.098 0.011
#> Female union member 0.023 0.020
#> Male union member 0.095 0.020
#> Married female 0.016 0.010
#> Married male 0.211 0.010
#> Formerly married female -0.006 0.012
#> Formerly married male 0.083 0.015
#> Hispanic -0.108 0.008
#> Black -0.096 0.008
#> American Indian -0.137 0.026
#> Asian -0.038 0.013
#> Mixed race -0.041 0.021
Equation 4.57 using cluster-robust standard errors
The C++ code for the DDK (2011) cluster-robust standard errors is as follows:
double> ddk_(const doubles_matrix<>& y,
Mat<const doubles_matrix<>& x,
const doubles_matrix<>& z) {
double> Y = as_Mat(y);
Mat<double> X = as_Mat(x);
Mat<double> Z = as_Mat(z);
Mat<
int n = Y.n_rows;
int k = X.n_cols;
double> xx = X.t() * X;
Mat<double> xx_inv = inv(xx);
Mat<double> beta = ols_(Y, X);
Mat<
double> xe(n, k);
Mat<for (int j = 0; j < k; j++) {
xe.col(j) = X.col(j) % (Y - X * beta);
}
// Unique groups in Z
double> Z_unique = unique(Z);
Mat<int n_groups = Z_unique.n_rows;
// Grouped sums
double> xe_sum(n_groups, k, fill::zeros);
Mat<for (int g = 0; g < n_groups; g++) {
int group = Z_unique(g, 0);
for (int i = 0; i < n; ++i) {
if (Z(i, 0) == group) {
xe_sum.row(g) += xe.row(i);
}
}
}
double> omega = xe_sum.t() * xe_sum;
Mat<double scale = (n_groups / (n_groups - 1)) * ((n - 1) / (n - k));
double> V_clustered = scale * (xx_inv * omega * xx_inv);
Mat<double> se_clustered = sqrt(V_clustered.diag());
Mat<
return join_rows(beta, se_clustered);
}
cpp11::register]] doubles_matrix<> ddk_r_(const doubles_matrix<>& y,
[[const doubles_matrix<>& x,
const doubles_matrix<>& z) {
return as_doubles_matrix(ddk(y, x, z));
}
Now we can use the function in R:
y <- scale(as.matrix(ddk2011$totalscore))
x <- cbind(1, as.matrix(ddk2011$tracking_num))
z <- as.matrix(ddk2011$schoolid)
# if we do not change the storage type of x and z, the function will not work
# and will return an integer vs double error
storage.mode(x) <- "double"
storage.mode(z) <- "double"
ddk <- ddk(y, x, z)
rownames(ddk) <- c("Intercept", "Tracking")
colnames(ddk) <- c("Estimate", "Std. Error")
round(ddk, 3)
#> Estimate Std. Error
#> Intercept -0.071 0.054
#> Tracking 0.138 0.077