###############################################################
# LABOUR ECONOMICS – PROBLEM SET 1.2
# Measurement Error, Missing Variables, and IV Estimation
# Author: Bademba Drammeh
# Date: 2025-11-10
###############################################################

# --- SETUP ---
rm(list = ls())
options(scipen = 5)

# --- Load required libraries ---
packages  <- c("psych", "ggplot2", "stargazer", "AER", "dplyr")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

invisible(lapply(packages, library, character.only = TRUE))  # load quietly; suppress the returned list
## Warning: package 'psych' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.4.3
## Loading required package: survival
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# --- LOAD DATA ---
setwd("C:/Users/bdrammeh/Desktop/Labour Economics/assignmnt")
load("ps1_more_realistic_data.Rda")

# Check what objects were loaded
print(ls())
## [1] "df2"       "installed" "packages"
# If your dataset has a different name, rename it here:
#df2 <- ps1_more_realistic_data
# Otherwise, if df2 already exists, continue

# Inspect structure and summary
str(df2)
## 'data.frame':    5000 obs. of  4 variables:
##  $ education   : num  14 14 17 14 12 14 16 10 11 13 ...
##  $ wage_premium: int  1 0 1 0 0 0 1 0 0 0 ...
##  $ hours_ME    : num  7.72 8.7 9.33 8.08 8.04 ...
##  $ ln_wage_ME  : num  3.53 5.14 3.99 2.5 3.62 ...
summary(df2)
##    education      wage_premium       hours_ME       ln_wage_ME      
##  Min.   :10.00   Min.   :0.0000   Min.   :6.074   Min.   :0.002279  
##  1st Qu.:10.00   1st Qu.:0.0000   1st Qu.:7.573   1st Qu.:2.776697  
##  Median :12.00   Median :0.0000   Median :7.937   Median :3.349062  
##  Mean   :12.77   Mean   :0.3422   Mean   :7.948   Mean   :3.356330  
##  3rd Qu.:14.00   3rd Qu.:1.0000   3rd Qu.:8.323   3rd Qu.:3.943028  
##  Max.   :24.00   Max.   :1.0000   Max.   :9.828   Max.   :6.344340
# --- Data preparation ---
# Not needed here: ln_wage_ME is already provided in df2. If only wage_ME were
# available, the log wage would be constructed as follows:
#df2 <- subset(df2, wage_ME > 0)
#df2$ln_wage_ME <- log(df2$wage_ME)

# --- (a) DESCRIPTIVE STATISTICS ---

#install.packages("psych")
#library(psych)
cat("\n### (a) Descriptive Statistics\n")
## 
## ### (a) Descriptive Statistics
describe(df2[, c("education", "wage_premium", "hours_ME", "ln_wage_ME")])
##              vars    n  mean   sd median trimmed  mad   min   max range skew
## education       1 5000 12.77 2.42  12.00   12.52 2.97 10.00 24.00 14.00 0.67
## wage_premium    2 5000  0.34 0.47   0.00    0.30 0.00  0.00  1.00  1.00 0.66
## hours_ME        3 5000  7.95 0.55   7.94    7.94 0.55  6.07  9.83  3.75 0.09
## ln_wage_ME      4 5000  3.36 0.86   3.35    3.35 0.86  0.00  6.34  6.34 0.01
##              kurtosis   se
## education       -0.15 0.03
## wage_premium    -1.56 0.01
## hours_ME        -0.10 0.01
## ln_wage_ME       0.08 0.01
# Density and histogram plots
ggplot(df2, aes(x = ln_wage_ME)) +
  geom_density(fill = "lightblue", alpha = 0.6) +
  labs(title = "Density of Log(Wage) with Measurement Error",
       x = "ln(Wage_ME)", y = "Density")

ggplot(df2, aes(x = hours_ME)) +
  geom_histogram(bins = 20, fill = "lightgreen", color = "black") +
  labs(title = "Histogram of Hours Worked (Measured)",
       x = "Hours (ME)", y = "Count")

# (a) Descriptive Statistics – Summary Interpretation

# Education averages about 12.8 years (median 12), so most individuals completed
# secondary school and some obtained postsecondary education.
# The wage premium is a binary indicator with mean 0.34, i.e. roughly a third of the
# sample was randomly assigned the premium as part of the income support experiment.
# Measured hours worked (hours_ME) average roughly 8 hours per day, with noticeable
# dispersion that partly reflects measurement noise.
# The log of measured wages (ln_wage_ME) is more dispersed and less smooth than the
# clean wage data in PS1.1, consistent with the presence of measurement error.

# Overall interpretation:
# Compared with the earlier clean dataset, these descriptive patterns show greater
# variability, confirming that the "more realistic" data contain measurement error.
# Motivation is not observed in df2, so unobserved motivation becomes a missing
# variable here. The randomly assigned wage premium is essentially uncorrelated with
# education, supporting its use as an instrument in the later analyses.
# --- (b) REGRESSION: Hours on Log(Wage) with Measurement Error ---
cat("\n### (b) OLS Regression with Measurement Error Variables\n")
## 
## ### (b) OLS Regression with Measurement Error Variables
model_ME <- lm(hours_ME ~ ln_wage_ME, data = df2)

summary(model_ME)
## 
## Call:
## lm(formula = hours_ME ~ ln_wage_ME, data = df2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.86052 -0.37018 -0.00967  0.36631  1.81559 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.651493   0.031170 245.475   <2e-16 ***
## ln_wage_ME  0.088266   0.008999   9.808   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5443 on 4998 degrees of freedom
## Multiple R-squared:  0.01888,    Adjusted R-squared:  0.01869 
## F-statistic:  96.2 on 1 and 4998 DF,  p-value: < 2.2e-16
ggplot(df2, aes(x = ln_wage_ME, y = hours_ME)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(title = "Hours (ME) vs Log(Wage) (ME)",
       x = "Log(Wage) (with Measurement Error)",
       y = "Hours Worked (Measured)")
## `geom_smooth()` using formula = 'y ~ x'

# (b) OLS Regression with Measurement Error – Summary Interpretation

# The OLS regression of hours on measured log wages yields a weaker relationship than
# in the clean dataset (PS1.1): the coefficient (0.088) and the R² (0.019) are both small.
# Classical measurement error in ln_wage_ME causes attenuation bias, so OLS
# underestimates the true effect of wages on hours worked. In short, the estimated
# labour supply response is biased toward zero because the wage data are noisy.
# A small simulation sketch below illustrates the mechanism.
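# A minimal simulation sketch (illustrative only, not part of the assignment data;
# all object names below are made up) of why classical measurement error attenuates
# the OLS slope: the estimate shrinks by roughly var(x_true) / (var(x_true) + var(noise)).
set.seed(123)
n       <- 5000
x_true  <- rnorm(n, mean = 3, sd = 1)              # "true" log wage
y_sim   <- 8 + 0.5 * x_true + rnorm(n, sd = 0.5)   # hours, true slope = 0.5
x_noisy <- x_true + rnorm(n, sd = 1)               # log wage observed with error
b_clean <- coef(lm(y_sim ~ x_true))[2]             # close to the true 0.5
b_noisy <- coef(lm(y_sim ~ x_noisy))[2]            # attenuated: roughly 0.5 * 1/(1 + 1) = 0.25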
#library(dplyr)    # already attached above
#library(magrittr) # not needed: no pipes are used below
# --- (c) CHECK FOR INSTRUMENT VALIDITY ---

# Calculate correlation matrix and store it
correlations <- cor(df2[, c("education", "wage_premium", "hours_ME", "ln_wage_ME")], 
                    use = "complete.obs")

# Print it
print(correlations)
##               education wage_premium   hours_ME ln_wage_ME
## education    1.00000000   0.01368347 0.57102052  0.1451116
## wage_premium 0.01368347   1.00000000 0.07193791  0.1964186
## hours_ME     0.57102052   0.07193791 1.00000000  0.1374181
## ln_wage_ME   0.14511160   0.19641863 0.13741806  1.0000000
# Interpretation:

# wage_premium is clearly correlated with ln_wage_ME (r ≈ 0.20) but essentially
# uncorrelated with education (r ≈ 0.01). This is consistent with random assignment:
# the instrument is relevant for wages and plausibly unrelated to education or
# unobserved motivation, which supports its validity as an instrument for wages.
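# Illustrative follow-up (sketch): significance tests for the two correlations behind
# the argument above. Exogeneity with respect to unobservables (motivation) cannot be
# tested directly; the education check only speaks to observables. The formal
# relevance test is the first-stage F statistic in part (e).
relevance_check <- cor.test(df2$wage_premium, df2$ln_wage_ME)  # expect clearly non-zero (relevance)
balance_check   <- cor.test(df2$wage_premium, df2$education)   # expect close to zero (random assignment)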
# --- (d) BALANCEDNESS CHECK ---

# Create treatment group based on the randomly assigned wage premium
df2$treat_group <- ifelse(df2$wage_premium > 0, "Treatment", "Control")

# The intended balance check compares motivation across the two groups:
#   balance_test <- t.test(motivation ~ treat_group, data = df2)
#   print(balance_test)
# Motivation is not observed in df2, so this test cannot be run here. Under random
# assignment, motivation (and other pre-determined characteristics) should not differ
# significantly between treatment and control groups (p > 0.05). An observable
# analogue using education is sketched below.
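# Sketch of the observable analogue: since motivation is unavailable, the same balance
# logic can at least be applied to education, which is observed and pre-determined.
balance_educ <- t.test(education ~ treat_group, data = df2)
# print(balance_educ)   # under random assignment, expect an insignificant difference (p > 0.05)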
#install.packages("AER")
#library(AER)

# --- (e) IV ESTIMATION ---
cat("\n### (e) IV Estimation: Wage Premium as Instrument for Log(Wage)\n")
## 
## ### (e) IV Estimation: Wage Premium as Instrument for Log(Wage)
iv_model1 <- ivreg(hours_ME ~ ln_wage_ME | wage_premium, data = df2)
summary(iv_model1, diagnostics = TRUE)
## 
## Call:
## ivreg(formula = hours_ME ~ ln_wage_ME | wage_premium, data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.946754 -0.374956 -0.008143  0.367165  1.977099 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept)  7.15818    0.15802  45.298     < 2e-16 ***
## ln_wage_ME   0.23525    0.04702   5.003 0.000000585 ***
## 
## Diagnostic tests:
##                   df1  df2 statistic p-value    
## Weak instruments    1 4998    200.56 < 2e-16 ***
## Wu-Hausman          1 4997     10.72 0.00106 ** 
## Sargan              0   NA        NA      NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5587 on 4998 degrees of freedom
## Multiple R-Squared: -0.03348,    Adjusted R-squared: -0.03369 
## Wald test: 25.03 on 1 and 4998 DF,  p-value: 0.0000005847
cat("\n### First-Stage Regression\n")
## 
## ### First-Stage Regression
first_stage <- lm(ln_wage_ME ~ wage_premium, data = df2)
summary(first_stage)
## 
## Call:
## lm(formula = ln_wage_ME ~ wage_premium, data = df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2329 -0.5703 -0.0064  0.5727  3.1092 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.23515    0.01463  221.17   <2e-16 ***
## wage_premium  0.35412    0.02501   14.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8389 on 4998 degrees of freedom
## Multiple R-squared:  0.03858,    Adjusted R-squared:  0.03839 
## F-statistic: 200.6 on 1 and 4998 DF,  p-value: < 2.2e-16
# Interpretation:

# The IV coefficient (0.235) is larger than the OLS estimate (0.088), consistent with
# IV correcting the attenuation bias caused by measurement error.
# The first-stage regression confirms that wage_premium is a strong instrument: it
# predicts ln_wage_ME with an F statistic of about 200, far above conventional
# weak-instrument thresholds.
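# Sketch: with a single binary instrument and no controls, the IV slope equals the
# Wald ratio, i.e. the reduced-form effect of the premium on hours divided by the
# first-stage effect of the premium on log wages. This should reproduce the ivreg estimate.
reduced_form <- lm(hours_ME ~ wage_premium, data = df2)
wald_ratio   <- coef(reduced_form)["wage_premium"] / coef(first_stage)["wage_premium"]
# wald_ratio should match the ln_wage_ME coefficient of about 0.235 in iv_model1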
#install.packages("stargazer")
#library(stargazer)
# --- (f) COMPARE OLS VS IV RESULTS ---
cat("\n### (f) Comparing OLS vs IV Models\n")
## 
## ### (f) Comparing OLS vs IV Models
stargazer(model_ME, iv_model1, type = "text",
          title = "OLS vs IV Estimation (Measurement Error)",
          dep.var.labels = "Hours Worked (Measured)",
          covariate.labels = c("Log(Wage) (ME)"))
## 
## OLS vs IV Estimation (Measurement Error)
## =====================================================================
##                                          Dependent variable:         
##                                 -------------------------------------
##                                        Hours Worked (Measured)       
##                                           OLS            instrumental
##                                                            variable  
##                                           (1)                (2)     
## ---------------------------------------------------------------------
## Log(Wage) (ME)                          0.088***           0.235***  
##                                         (0.009)            (0.047)   
##                                                                      
## Constant                                7.651***           7.158***  
##                                         (0.031)            (0.158)   
##                                                                      
## ---------------------------------------------------------------------
## Observations                             5,000              5,000    
## R2                                       0.019              -0.033   
## Adjusted R2                              0.019              -0.034   
## Residual Std. Error (df = 4998)          0.544              0.559    
## F Statistic                     96.197*** (df = 1; 4998)             
## =====================================================================
## Note:                                     *p<0.1; **p<0.05; ***p<0.01
# Interpretation:

# The IV estimate implies a stronger relationship between wages and hours than OLS.
# This matches theory: classical measurement error biases the OLS coefficient toward
# zero, whereas IV with a valid instrument remains consistent, so the IV coefficient
# on ln_wage_ME is larger (0.235 vs 0.088).
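# Optional sketch: heteroskedasticity-robust standard errors for both models, using
# lmtest/sandwich (attached via AER). Point estimates are unchanged; only the standard
# errors are recomputed. (This assumes sandwich's vcovHC can be applied to ivreg
# objects, which AER supports via its bread/estfun methods.)
ols_robust <- coeftest(model_ME,  vcov = vcovHC(model_ME,  type = "HC1"))
iv_robust  <- coeftest(iv_model1, vcov = vcovHC(iv_model1, type = "HC1"))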
# --- (g) IV ESTIMATION WITH EDUCATION CONTROL ---
cat("\n### (g) IV Estimation Adding Education\n")
## 
## ### (g) IV Estimation Adding Education
iv_model2 <- ivreg(hours_ME ~ ln_wage_ME + education | wage_premium + education, data = df2)
summary(iv_model2, diagnostics = TRUE)
## 
## Call:
## ivreg(formula = hours_ME ~ ln_wage_ME + education | wage_premium + 
##     education, data = df2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.820363 -0.320934 -0.007087  0.320817  1.917858 
## 
## Coefficients:
##             Estimate Std. Error t value    Pr(>|t|)    
## (Intercept) 5.723029   0.114740  49.878     < 2e-16 ***
## ln_wage_ME  0.211836   0.040317   5.254 0.000000155 ***
## education   0.118571   0.003452  34.352     < 2e-16 ***
## 
## Diagnostic tests:
##                   df1  df2 statistic    p-value    
## Weak instruments    1 4997     200.8    < 2e-16 ***
## Wu-Hausman          1 4996      22.1 0.00000266 ***
## Sargan              0   NA        NA         NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4742 on 4997 degrees of freedom
## Multiple R-Squared: 0.2556,  Adjusted R-squared: 0.2553 
## Wald test:  1108 on 2 and 4997 DF,  p-value: < 2.2e-16
# Interpretation:

# Adding education changes the wage coefficient only slightly (0.235 to 0.212) and
# does not affect its significance, while education itself enters strongly positive.
# The IV estimate is therefore robust to including observable human capital controls.
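# Sketch: 2SLS "by hand" for the specification with education (object names below are
# illustrative). The first stage regresses ln_wage_ME on the instrument and the
# exogenous control; the second stage replaces ln_wage_ME with its fitted values.
# Point estimates should match iv_model2, but the standard errors from the manual
# second stage are not valid -- use ivreg for inference.
fs_g           <- lm(ln_wage_ME ~ wage_premium + education, data = df2)
df2$lnwage_hat <- fitted(fs_g)
manual_2sls    <- lm(hours_ME ~ lnwage_hat + education, data = df2)
# coef(manual_2sls) should reproduce the ln_wage_ME and education coefficients above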
# --- (h) COMPARE FINAL MODELS ---
cat("\n### (h) Comparison: OLS (ME) vs IV Models (Simple & with Education)\n")
## 
## ### (h) Comparison: OLS (ME) vs IV Models (Simple & with Education)
stargazer(model_ME, iv_model1, iv_model2, type = "text",
          title = "Comparison of Models: OLS (ME) vs IV (Simple & with Education)",
          dep.var.labels = "Hours Worked (Measured)",
          covariate.labels = c("Log(Wage)", "Education"))
## 
## Comparison of Models: OLS (ME) vs IV (Simple & with Education)
## ================================================================================
##                                         Dependent variable:                     
##                     ------------------------------------------------------------
##                                       Hours Worked (Measured)                   
##                               OLS                       instrumental            
##                                                           variable              
##                               (1)                   (2)               (3)       
## --------------------------------------------------------------------------------
## Log(Wage)                   0.088***             0.235***          0.212***     
##                             (0.009)               (0.047)           (0.040)     
##                                                                                 
## Education                                                          0.119***     
##                                                                     (0.003)     
##                                                                                 
## Constant                    7.651***             7.158***          5.723***     
##                             (0.031)               (0.158)           (0.115)     
##                                                                                 
## --------------------------------------------------------------------------------
## Observations                 5,000                 5,000             5,000      
## R2                           0.019                -0.033             0.256      
## Adjusted R2                  0.019                -0.034             0.255      
## Residual Std. Error    0.544 (df = 4998)     0.559 (df = 4998) 0.474 (df = 4997)
## F Statistic         96.197*** (df = 1; 4998)                                    
## ================================================================================
## Note:                                                *p<0.1; **p<0.05; ***p<0.01
# Interpretation:

# The preferred specification is the IV model with education (part g): it corrects for
# measurement error in wages and controls for observable skill differences.
# The implied labour supply response (the change in hours when the wage changes) is
# larger and more theoretically consistent than the attenuated OLS estimate.
# Compared with the PS1.1 results, IV delivers a consistent estimate of the labour
# supply elasticity despite the noisier data.
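# Sketch: converting the level-log coefficient into an elasticity. The coefficient on
# ln(wage) gives the change in hours for a one-unit change in log wage; dividing by
# mean hours yields an approximate labour supply elasticity evaluated at the mean.
elasticity_at_mean <- coef(iv_model2)["ln_wage_ME"] / mean(df2$hours_ME)
# roughly 0.212 / 7.95, i.e. an elasticity of about 0.03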