Chapter 4. Least Squares Regression • hansen

Load the data and create subsamples

We process the data with the following code:

library(hansen)
#> 
#> Attaching package: 'hansen'
#> The following object is masked from 'package:stats':
#> 
#>     influence

experience <- cps09mar$age - cps09mar$education_num - 6
mbf <- (cps09mar$race_num == 2) & (cps09mar$marital_num <= 2) &
  (cps09mar$female_num == 1) & (experience == 12)
sam <- (cps09mar$race_num == 4) & (cps09mar$marital_num == 7) &
  (cps09mar$female_num == 0)
dat1 <- cps09mar[mbf, ]
dat2 <- cps09mar[sam, ]

dim(dat1)
#> [1] 20 18
dim(dat2)
#> [1] 268  18

Table 4.1

Using the OLS functions from Chapter 3, we can compute the standard errors with the following C++ code:

#include "00_main.h"

Mat<double> ols2_(const doubles_matrix<>& y, const doubles_matrix<>& x) {
  Mat<double> Y = as_Mat(y);
  Mat<double> X = as_Mat(x);

  Mat<double> beta = ols_(Y, X);
  Mat<double> e = Y - (X * beta);
  Mat<double> XXi = inv(X.t() * X);
  Mat<double> leverage = sum(X % (X * XXi), 1);

  double n = Y.n_rows; // using int then returns the wrong "a" value
  double k = X.n_cols;
  double sig2 = as_scalar((e.t() * e) / (n - k));

  Mat<double> ones = Mat<double>(1, k, fill::ones);
  Mat<double> xx = inv(X.t() * X);

  // Heteroskedastic formula
  Mat<double> v0 = xx * sig2;
  Mat<double> s0 = sqrt(v0.diag());

  // White formula
  Mat<double> u1 = X % (e * ones);
  Mat<double> v1 = xx * (u1.t() * u1) * xx;
  Mat<double> s1 = sqrt(v1.diag());

  // HC1 formula
  double a = n / (n - k);
  Mat<double> v1a = a * (xx * (u1.t() * u1) * xx);
  Mat<double> s1a = sqrt(v1a.diag());

  // HC2 formula
  Mat<double> u2 = X % ((e / sqrt(1 - leverage)) * ones);
  Mat<double> v2 = xx * (u2.t() * u2) * xx;
  Mat<double> s2 = sqrt(v2.diag());

  // HC3 formula
  Mat<double> u3 = X % ((e / (1 - leverage)) * ones);
  Mat<double> v3 = xx * (u3.t() * u3) * xx;
  Mat<double> s3 = sqrt(v3.diag());

  // Bind the results
  Mat<double> result = join_rows(s0, s1, s1a, s2);
  result = join_rows(result, s3); // join_rows admits max 4 arguments
  return result.t();
}

[[cpp11::register]] doubles_matrix<> ols2_r_(const doubles_matrix<>& y,
                                             const doubles_matrix<>& x) {
  return as_doubles_matrix(ols2(y, x));
}

Now the function can be used in R:

y <- as.matrix(log(dat1$earnings / (dat1$hours * dat1$week)))
x <- cbind(1, dat1$education_num)

table41 <- ols2(y, x)

rownames(table41) <- c("Homoskedastic (4.32)", "HC0 (4.33)", "HC1 (4.34)",
  "HC2 (4.35)", "HC3 (4.36)")
colnames(table41) <- c("Intercept", "Education")

round(table41, 3)
#>                      Intercept Education
#> Homoskedastic (4.32)     0.707     0.045
#> HC0 (4.33)               0.461     0.029
#> HC1 (4.34)               0.486     0.030
#> HC2 (4.35)               0.493     0.031
#> HC3 (4.36)               0.527     0.033

Table 4.2

Similar to the first section, we need to create subtables:

edu12 <- (cps09mar$education_num > 11)
dat3 <- cps09mar[edu12, ]

marriedF <- (dat3$marital_num <= 3) & (dat3$female_num == 1)
marriedM <- (dat3$marital_num <= 3) & (dat3$female_num == 0)
unionF <- (dat3$union == 1) & (dat3$female_num == 1)
unionM <- (dat3$union == 1) & (dat3$female_num == 0)
fmarriedF <- (dat3$marital_num <= 6) & (dat3$marital_num > 3) &
  (dat3$female_num == 1)
fmarriedM <- (dat3$marital_num <= 6) & (dat3$marital_num > 3) &
  (dat3$female_num == 0)
black <- (dat3$race_num == 2)
american_indian <- (dat3$race_num == 3)
asian <- (dat3$race_num == 4)
mixed <- (dat3$race_num >= 6)

The C++ function for Table 4.2 is similar to the previous one:

Mat<double> cls_(const doubles_matrix<>& y, const doubles_matrix<>& x) {
  Mat<double> Y = as_Mat(y);
  Mat<double> X = as_Mat(x);

  Mat<double> beta = ols_(Y, X);
  Mat<double> e = Y - (X * beta);
  Mat<double> XXi = inv(X.t() * X);
  Mat<double> leverage = sum(X % (X * XXi), 1);

  double n = Y.n_rows;  // using int then returns the wrong "a" value
  double k = X.n_cols;

  Mat<double> ones = Mat<double>(1, k, fill::ones);
  Mat<double> xx = inv(X.t() * X);

  // HC2 formula
  Mat<double> u2 = X % ((e / sqrt(1 - leverage)) * ones);
  Mat<double> v2 = xx * (u2.t() * u2) * xx;
  Mat<double> s2 = sqrt(v2.diag());

  // Bind the results
  Mat<double> result = join_rows(beta, s2);
  return result;
}

[[cpp11::register]] doubles_matrix<> cls_r_(const doubles_matrix<>& y,
                                            const doubles_matrix<>& x) {
  return as_doubles_matrix(cls_(y, x));
}

Now we can replicate the results in R:

y <- as.matrix(log(dat3$earnings / (dat3$hours * dat3$week)))
experience <- dat3$age - dat3$education_num - 6
exp2 <- (experience^2) / 100
x <- cbind(
  1, dat3$education_num, experience, exp2, dat3$female_num,
  unionF, unionM, marriedF, marriedM, fmarriedF, fmarriedM,
  dat3$hisp_num, black, american_indian, asian, mixed
)

table42 <- cls(y, x)

rownames(table42) <- c(
  "Intercept", "Education", "Experience",
  "Experience^2 / 100", "Female", "Female union member", "Male union member",
  "Married female", "Married male", "Formerly married female",
  "Formerly married male", "Hispanic", "Black", "American Indian", "Asian",
  "Mixed race"
)
colnames(table42) <- c("Estimate", "Std. Error")

round(table42, 3)
#>                         Estimate Std. Error
#> Intercept                  0.909      0.021
#> Education                  0.117      0.001
#> Experience                 0.033      0.001
#> Experience^2 / 100        -0.056      0.002
#> Female                    -0.098      0.011
#> Female union member        0.023      0.020
#> Male union member          0.095      0.020
#> Married female             0.016      0.010
#> Married male               0.211      0.010
#> Formerly married female   -0.006      0.012
#> Formerly married male      0.083      0.015
#> Hispanic                  -0.108      0.008
#> Black                     -0.096      0.008
#> American Indian           -0.137      0.026
#> Asian                     -0.038      0.013
#> Mixed race                -0.041      0.021

Equation 4.57 using cluster-robust standard errors

The C++ code for the DDK (2011) cluster-robust standard errors is as follows:

Mat<double> ddk_(const doubles_matrix<>& y,
                 const doubles_matrix<>& x,
                 const doubles_matrix<>& z) {
  Mat<double> Y = as_Mat(y);
  Mat<double> X = as_Mat(x);
  Mat<double> Z = as_Mat(z);

  int n = Y.n_rows;
  int k = X.n_cols;

  Mat<double> xx = X.t() * X;
  Mat<double> xx_inv = inv(xx);
  Mat<double> beta = ols_(Y, X);
  
  Mat<double> xe(n, k);
  for (int j = 0; j < k; j++) {
    xe.col(j) = X.col(j) % (Y - X * beta);
  }

  // Unique groups in Z
  Mat<double> Z_unique = unique(Z);
  int n_groups = Z_unique.n_rows;

  // Grouped sums
  Mat<double> xe_sum(n_groups, k, fill::zeros);
  for (int g = 0; g < n_groups; g++) {
    int group = Z_unique(g, 0);
    for (int i = 0; i < n; ++i) {
      if (Z(i, 0) == group) {
        xe_sum.row(g) += xe.row(i);
      }
    }
  }

  Mat<double> omega = xe_sum.t() * xe_sum;
  double scale = (n_groups / (n_groups - 1)) * ((n - 1) / (n - k));
  Mat<double> V_clustered = scale * (xx_inv * omega * xx_inv);
  Mat<double> se_clustered = sqrt(V_clustered.diag());

  return join_rows(beta, se_clustered);
}

[[cpp11::register]] doubles_matrix<> ddk_r_(const doubles_matrix<>& y,
                                            const doubles_matrix<>& x,
                                            const doubles_matrix<>& z) {
  return as_doubles_matrix(ddk(y, x, z));
}

Now we can use the function in R:

y <- scale(as.matrix(ddk2011$totalscore))
x <- cbind(1, as.matrix(ddk2011$tracking_num))
z <- as.matrix(ddk2011$schoolid)

# if we do not change the storage type of x and z, the function will not work
# and will return an integer vs double error

storage.mode(x) <- "double"
storage.mode(z) <- "double"

ddk <- ddk(y, x, z)
rownames(ddk) <- c("Intercept", "Tracking")
colnames(ddk) <- c("Estimate", "Std. Error")

round(ddk, 3)
#>           Estimate Std. Error
#> Intercept   -0.071      0.054
#> Tracking     0.138      0.077