###############################################################
# LABOUR ECONOMICS – PROBLEM SET 1.2
# Measurement Error, Missing Variables, and IV Estimation
# Author: Bademba Drammeh
# Date: 2025-11-10
###############################################################
# --- SETUP ---
# NOTE(review): rm(list = ls()) is generally discouraged in scripts (restart
# R for a clean session instead); kept here to match the original workflow.
rm(list = ls())
options(scipen = 5)
# --- Load required libraries ---
packages <- c("psych", "ggplot2", "stargazer", "AER", "dplyr")
# Compare against installed package NAMES (rownames). Matching against the
# full installed.packages() matrix (as before) also scans versions, license
# strings and paths, which can produce false positives.
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])
# invisible() suppresses the long list of attached-package vectors that
# lapply() would otherwise auto-print at top level.
invisible(lapply(packages, library, character.only = TRUE))
## Warning: Paket 'psych' wurde unter R Version 4.4.3 erstellt
## Warning: Paket 'ggplot2' wurde unter R Version 4.4.3 erstellt
##
## Attache Paket: 'ggplot2'
## Die folgenden Objekte sind maskiert von 'package:psych':
##
## %+%, alpha
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Warning: Paket 'AER' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: car
## Warning: Paket 'car' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: carData
## Warning: Paket 'carData' wurde unter R Version 4.4.3 erstellt
##
## Attache Paket: 'car'
## Das folgende Objekt ist maskiert 'package:psych':
##
## logit
## Lade nötiges Paket: lmtest
## Warning: Paket 'lmtest' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: zoo
## Warning: Paket 'zoo' wurde unter R Version 4.4.3 erstellt
##
## Attache Paket: 'zoo'
## Die folgenden Objekte sind maskiert von 'package:base':
##
## as.Date, as.Date.numeric
## Lade nötiges Paket: sandwich
## Warning: Paket 'sandwich' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: survival
## Warning: Paket 'dplyr' wurde unter R Version 4.4.3 erstellt
##
## Attache Paket: 'dplyr'
## Das folgende Objekt ist maskiert 'package:car':
##
## recode
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
## [[1]]
## [1] "psych" "stats" "graphics" "grDevices" "utils" "datasets"
## [7] "methods" "base"
##
## [[2]]
## [1] "ggplot2" "psych" "stats" "graphics" "grDevices" "utils"
## [7] "datasets" "methods" "base"
##
## [[3]]
## [1] "stargazer" "ggplot2" "psych" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "AER" "survival" "sandwich" "lmtest" "zoo" "car"
## [7] "carData" "stargazer" "ggplot2" "psych" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "dplyr" "AER" "survival" "sandwich" "lmtest" "zoo"
## [7] "car" "carData" "stargazer" "ggplot2" "psych" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
# --- LOAD DATA ---
# NOTE(review): hard-coded, machine-specific path; prefer a project-relative
# path (or an RStudio project / here::here()) so the script runs elsewhere.
setwd("C:/Users/bdrammeh/Desktop/Labour Economics/assignmnt")
load("ps1_more_realistic_data.Rda")
# Check what objects were loaded
print(ls())
## [1] "df2" "installed" "packages"
# Fail early with a clear message if the expected data frame is missing,
# instead of erroring opaquely at the first use of df2 below.
if (!exists("df2")) {
  stop("Object 'df2' was not found in ps1_more_realistic_data.Rda", call. = FALSE)
}
# If your dataset has a different name, rename it here:
#df2 <- ps1_more_realistic_data
# Inspect structure and summary
str(df2)
## 'data.frame': 5000 obs. of 4 variables:
## $ education : num 14 14 17 14 12 14 16 10 11 13 ...
## $ wage_premium: int 1 0 1 0 0 0 1 0 0 0 ...
## $ hours_ME : num 7.72 8.7 9.33 8.08 8.04 ...
## $ ln_wage_ME : num 3.53 5.14 3.99 2.5 3.62 ...
summary(df2)
## education wage_premium hours_ME ln_wage_ME
## Min. :10.00 Min. :0.0000 Min. :6.074 Min. :0.002279
## 1st Qu.:10.00 1st Qu.:0.0000 1st Qu.:7.573 1st Qu.:2.776697
## Median :12.00 Median :0.0000 Median :7.937 Median :3.349062
## Mean :12.77 Mean :0.3422 Mean :7.948 Mean :3.356330
## 3rd Qu.:14.00 3rd Qu.:1.0000 3rd Qu.:8.323 3rd Qu.:3.943028
## Max. :24.00 Max. :1.0000 Max. :9.828 Max. :6.344340
# --- Data preparation ---
# (not needed here: ln_wage_ME already arrives logged in this dataset)
#df2 <- subset(df2, wage_ME > 0)
#df2$ln_wage_ME <- log(df2$wage_ME)
# --- (a) DESCRIPTIVE STATISTICS ---
#install.packages("psych")
#library(psych)
cat("\n### (a) Descriptive Statistics\n")
##
## ### (a) Descriptive Statistics
# psych::describe() reports n, mean, sd, skew, kurtosis etc. per column.
describe(df2[, c("education", "wage_premium", "hours_ME", "ln_wage_ME")])
## vars n mean sd median trimmed mad min max range skew
## education 1 5000 12.77 2.42 12.00 12.52 2.97 10.00 24.00 14.00 0.67
## wage_premium 2 5000 0.34 0.47 0.00 0.30 0.00 0.00 1.00 1.00 0.66
## hours_ME 3 5000 7.95 0.55 7.94 7.94 0.55 6.07 9.83 3.75 0.09
## ln_wage_ME 4 5000 3.36 0.86 3.35 3.35 0.86 0.00 6.34 6.34 0.01
## kurtosis se
## education -0.15 0.03
## wage_premium -1.56 0.01
## hours_ME -0.10 0.01
## ln_wage_ME 0.08 0.01
# Density plot of measured log wages
p_wage_density <- ggplot(df2, aes(x = ln_wage_ME)) +
  geom_density(fill = "lightblue", alpha = 0.6) +
  labs(
    title = "Density of Log(Wage) with Measurement Error",
    x = "ln(Wage_ME)",
    y = "Density"
  )
print(p_wage_density)

# Histogram of measured hours
p_hours_hist <- ggplot(df2, aes(x = hours_ME)) +
  geom_histogram(bins = 20, fill = "lightgreen", color = "black") +
  labs(
    title = "Histogram of Hours Worked (Measured)",
    x = "Hours (ME)",
    y = "Count"
  )
print(p_hours_hist)
# (a) Descriptive Statistics – Summary Interpretation
#Education averages about 12.8 years (range 10–24), indicating most individuals completed secondary school, with some postsecondary education.
#The wage premium is a binary indicator with mean ≈ 0.34, i.e. roughly a third of the sample received the premium — a balanced split consistent with random assignment in an income support experiment.
#Average measured hours worked (hours_ME) are roughly 8 hours per day, though there is noticeable variation due to measurement noise.
#The log of measured wages (ln_wage_ME) displays a distribution that is less smooth and more dispersed compared to the clean data in PS1.1, reflecting the presence of measurement error.
#Overall Interpretation:
#Compared to the earlier clean dataset, these descriptive patterns reveal greater variability and less symmetry, confirming that the "more realistic" data include measurement error.
#Because the premium was randomly assigned, it is a promising candidate instrument for the later analyses.
# Descriptive stats show that the measurement-error variables (ln_wage_ME, hours_ME)
# are noisier, with larger variance; note that no motivation variable is observed in this dataset.
# --- (b) REGRESSION: Hours on Log(Wage) with Measurement Error ---
cat("\n### (b) OLS Regression with Measurement Error Variables\n")
##
## ### (b) OLS Regression with Measurement Error Variables
# Naive OLS of measured hours on measured log wages. Classical measurement
# error in the regressor attenuates this slope toward zero.
model_ME <- lm(hours_ME ~ ln_wage_ME, data = df2)
print(summary(model_ME))
##
## Call:
## lm(formula = hours_ME ~ ln_wage_ME, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.86052 -0.37018 -0.00967 0.36631 1.81559
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.651493 0.031170 245.475 <2e-16 ***
## ln_wage_ME 0.088266 0.008999 9.808 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5443 on 4998 degrees of freedom
## Multiple R-squared: 0.01888, Adjusted R-squared: 0.01869
## F-statistic: 96.2 on 1 and 4998 DF, p-value: < 2.2e-16
# Scatter plot with the fitted OLS line
p_scatter_me <- ggplot(df2, aes(x = ln_wage_ME, y = hours_ME)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    title = "Hours (ME) vs Log(Wage) (ME)",
    x = "Log(Wage) (with Measurement Error)",
    y = "Hours Worked (Measured)"
  )
print(p_scatter_me)
## `geom_smooth()` using formula = 'y ~ x'
# (b) OLS Regression with Measurement Error – Summary Interpretation
#The OLS regression of hours on logged measured wages shows a weaker relationship than in the clean dataset (PS1.1), with a smaller coefficient and lower R².
#This occurs because measurement error in ln_wage_ME causes attenuation bias, leading OLS to underestimate the true effect of wages on hours worked.
#In short, the observed elasticity of labor supply is biased toward zero due to noisy wage data
# OLS (hours_ME ~ ln_wage_ME) likely gives a smaller coefficient than before,
# due to attenuation bias from measurement error
#library(dplyr)
library(magrittr)
# --- (c) CHECK FOR INSTRUMENT VALIDITY ---
# Pairwise correlations: a valid instrument (wage_premium) should be
# related to the endogenous regressor (ln_wage_ME) but unrelated to
# other determinants of hours such as education.
key_vars <- c("education", "wage_premium", "hours_ME", "ln_wage_ME")
correlations <- cor(df2[, key_vars], use = "complete.obs")
print(correlations)
## education wage_premium hours_ME ln_wage_ME
## education 1.00000000 0.01368347 0.57102052 0.1451116
## wage_premium 0.01368347 1.00000000 0.07193791 0.1964186
## hours_ME 0.57102052 0.07193791 1.00000000 0.1374181
## ln_wage_ME 0.14511160 0.19641863 0.13741806 1.0000000
#Interpretation Example:
#The wage_premium is correlated with ln_wage_ME but not with education, indicating it’s likely exogenous.
# This supports its validity as an instrument for wages.
# The wage_premium should correlate with wages, but not with education/motivation.
# This supports its use as an instrument.
# --- (d) BALANCEDNESS CHECK ---
#cat("\n### (d) Balancedness Check on Motivation by Wage Premium Group\n")
# Label each observation by its wage-premium treatment status.
df2$treat_group <- ifelse(df2$wage_premium > 0, yes = "Treatment", no = "Control")
# A t-test of a pre-determined covariate across the premium groups would
# verify random assignment. NOTE(review): no 'motivation' column exists in
# df2 (it has only four variables), so the intended test stays commented out.
#balance_test <- t.test(motivation ~ treat_group, data = df2)
#print(balance_test)
# Balancedness check: motivation should not differ significantly
# between treatment and control groups (p > 0.05).
#install.packages("AER")
#library(AER)
# --- (e) IV ESTIMATION ---
cat("\n### (e) IV Estimation: Wage Premium as Instrument for Log(Wage)\n")
##
## ### (e) IV Estimation: Wage Premium as Instrument for Log(Wage)
# Two-stage least squares: the randomly assigned wage premium instruments
# the noisy log wage; diagnostics = TRUE adds weak-instrument, Wu-Hausman
# and Sargan tests to the summary.
iv_model1 <- ivreg(hours_ME ~ ln_wage_ME | wage_premium, data = df2)
print(summary(iv_model1, diagnostics = TRUE))
##
## Call:
## ivreg(formula = hours_ME ~ ln_wage_ME | wage_premium, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.946754 -0.374956 -0.008143 0.367165 1.977099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.15818 0.15802 45.298 < 2e-16 ***
## ln_wage_ME 0.23525 0.04702 5.003 0.000000585 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 4998 200.56 < 2e-16 ***
## Wu-Hausman 1 4997 10.72 0.00106 **
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5587 on 4998 degrees of freedom
## Multiple R-Squared: -0.03348, Adjusted R-squared: -0.03369
## Wald test: 25.03 on 1 and 4998 DF, p-value: 0.0000005847
cat("\n### First-Stage Regression\n")
##
## ### First-Stage Regression
# First stage checks instrument relevance: wage_premium must be a strong
# predictor of ln_wage_ME.
first_stage <- lm(ln_wage_ME ~ wage_premium, data = df2)
print(summary(first_stage))
##
## Call:
## lm(formula = ln_wage_ME ~ wage_premium, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2329 -0.5703 -0.0064 0.5727 3.1092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.23515 0.01463 221.17 <2e-16 ***
## wage_premium 0.35412 0.02501 14.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8389 on 4998 degrees of freedom
## Multiple R-squared: 0.03858, Adjusted R-squared: 0.03839
## F-statistic: 200.6 on 1 and 4998 DF, p-value: < 2.2e-16
# Interpretation:
# The IV coefficient (0.235) exceeds the OLS estimate (0.088), consistent
# with correcting attenuation bias from measurement error.
# The first stage is strong: wage_premium predicts ln_wage_ME with F ≈ 200,
# far above the rule-of-thumb threshold of 10.
#install.packages("stargazer")
#library(stargazer)
# --- (f) COMPARE OLS VS IV RESULTS ---
cat("\n### (f) Comparing OLS vs IV Models\n")
##
## ### (f) Comparing OLS vs IV Models
# Side-by-side text table of the attenuated OLS slope and the IV slope.
stargazer(
  model_ME, iv_model1,
  type = "text",
  title = "OLS vs IV Estimation (Measurement Error)",
  dep.var.labels = "Hours Worked (Measured)",
  covariate.labels = c("Log(Wage) (ME)")
)
##
## OLS vs IV Estimation (Measurement Error)
## =====================================================================
## Dependent variable:
## -------------------------------------
## Hours Worked (Measured)
## OLS instrumental
## variable
## (1) (2)
## ---------------------------------------------------------------------
## Log(Wage) (ME) 0.088*** 0.235***
## (0.009) (0.047)
##
## Constant 7.651*** 7.158***
## (0.031) (0.158)
##
## ---------------------------------------------------------------------
## Observations 5,000 5,000
## R2 0.019 -0.033
## Adjusted R2 0.019 -0.034
## Residual Std. Error (df = 4998) 0.544 0.559
## F Statistic 96.197*** (df = 1; 4998)
## =====================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Interpretation:
# The IV estimate implies a stronger wage–hours relationship, in line with
# theory: measurement error biases the OLS coefficient toward zero, while
# the IV estimator remains consistent despite the noisy regressor.
# --- (g) IV ESTIMATION WITH EDUCATION CONTROL ---
cat("\n### (g) IV Estimation Adding Education\n")
##
## ### (g) IV Estimation Adding Education
# Same 2SLS specification with education added as an exogenous control;
# education appears on both sides of '|' because it instruments itself.
iv_model2 <- ivreg(hours_ME ~ ln_wage_ME + education | wage_premium + education, data = df2)
print(summary(iv_model2, diagnostics = TRUE))
##
## Call:
## ivreg(formula = hours_ME ~ ln_wage_ME + education | wage_premium +
## education, data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.820363 -0.320934 -0.007087 0.320817 1.917858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.723029 0.114740 49.878 < 2e-16 ***
## ln_wage_ME 0.211836 0.040317 5.254 0.000000155 ***
## education 0.118571 0.003452 34.352 < 2e-16 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 4997 200.8 < 2e-16 ***
## Wu-Hausman 1 4996 22.1 0.00000266 ***
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4742 on 4997 degrees of freedom
## Multiple R-Squared: 0.2556, Adjusted R-squared: 0.2553
## Wald test: 1108 on 2 and 4997 DF, p-value: < 2.2e-16
# Interpretation:
# Adding education moves the wage coefficient only slightly (0.235 -> 0.212)
# without changing its significance, so the IV estimate is robust to
# including observable human-capital controls.
# --- (h) COMPARE FINAL MODELS ---
cat("\n### (h) Comparison: OLS (ME) vs IV Models (Simple & with Education)\n")
##
## ### (h) Comparison: OLS (ME) vs IV Models (Simple & with Education)
# Final side-by-side table of all three specifications.
stargazer(
  model_ME, iv_model1, iv_model2,
  type = "text",
  title = "Comparison of Models: OLS (ME) vs IV (Simple & with Education)",
  dep.var.labels = "Hours Worked (Measured)",
  covariate.labels = c("Log(Wage)", "Education")
)
##
## Comparison of Models: OLS (ME) vs IV (Simple & with Education)
## ================================================================================
## Dependent variable:
## ------------------------------------------------------------
## Hours Worked (Measured)
## OLS instrumental
## variable
## (1) (2) (3)
## --------------------------------------------------------------------------------
## Log(Wage) 0.088*** 0.235*** 0.212***
## (0.009) (0.047) (0.040)
##
## Education 0.119***
## (0.003)
##
## Constant 7.651*** 7.158*** 5.723***
## (0.031) (0.158) (0.115)
##
## --------------------------------------------------------------------------------
## Observations 5,000 5,000 5,000
## R2 0.019 -0.033 0.256
## Adjusted R2 0.019 -0.034 0.255
## Residual Std. Error 0.544 (df = 4998) 0.559 (df = 4998) 0.474 (df = 4997)
## F Statistic 96.197*** (df = 1; 4998)
## ================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Interpretation:
# The preferred specification is the IV model with education (column 3): it
# corrects the measurement-error bias and controls for observable skill
# differences, yielding a more realistic labour-supply elasticity than the
# attenuated OLS estimate.
# Compared with PS1.1, the IV approach recovers a consistent estimate of the
# labour-supply elasticity.