library(haven)
## Warning: Paket 'haven' wurde unter R Version 4.4.3 erstellt
library(foreign)
## Warning: Paket 'foreign' wurde unter R Version 4.4.3 erstellt
library(lmtest)
## Warning: Paket 'lmtest' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: zoo
## Warning: Paket 'zoo' wurde unter R Version 4.4.3 erstellt
## 
## Attache Paket: 'zoo'
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     as.Date, as.Date.numeric
library(Hmisc)
## Warning: Paket 'Hmisc' wurde unter R Version 4.4.3 erstellt
## 
## Attache Paket: 'Hmisc'
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     format.pval, units
library(plm)
## Warning: Paket 'plm' wurde unter R Version 4.4.3 erstellt
library(dplyr)
## Warning: Paket 'dplyr' wurde unter R Version 4.4.3 erstellt
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:plm':
## 
##     between, lag, lead
## Die folgenden Objekte sind maskiert von 'package:Hmisc':
## 
##     src, summarize
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(lfe)
## Warning: Paket 'lfe' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: Matrix
## 
## Attache Paket: 'lfe'
## Das folgende Objekt ist maskiert 'package:plm':
## 
##     sargan
## Das folgende Objekt ist maskiert 'package:lmtest':
## 
##     waldtest
library(tidyr)
## Warning: Paket 'tidyr' wurde unter R Version 4.4.3 erstellt
## 
## Attache Paket: 'tidyr'
## Die folgenden Objekte sind maskiert von 'package:Matrix':
## 
##     expand, pack, unpack
library(readr)
## Warning: Paket 'readr' wurde unter R Version 4.4.3 erstellt
library(coefplot)
## Warning: Paket 'coefplot' wurde unter R Version 4.4.3 erstellt
## Lade nötiges Paket: ggplot2
## Warning: Paket 'ggplot2' wurde unter R Version 4.4.3 erstellt
library(lmtest)
library(sandwich)
## Warning: Paket 'sandwich' wurde unter R Version 4.4.3 erstellt
library(knitr)
## Warning: Paket 'knitr' wurde unter R Version 4.4.3 erstellt
library(rmarkdown)
## Warning: Paket 'rmarkdown' wurde unter R Version 4.4.3 erstellt
#library(tinytex)

#set the path
setwd("C:/Users/bdrammeh/Desktop/Labour Economics/assignmnt")


# Clear desk
rm(list=ls())


######### Question 4.1 #########
# write some verbal answers in your solution pdf...


######### Question 4.2 #########

#read in the data
twins_long <- read_dta("twins_long.dta")

#descriptive analysis
attach(twins_long)
summary(twins_long)
##      famid          age             twin          educ           lwage       
##  Min.   :  1   Min.   :18.78   Min.   :1.0   Min.   : 8.00   Min.   :0.5108  
##  1st Qu.: 38   1st Qu.:29.05   1st Qu.:1.0   1st Qu.:12.00   1st Qu.:1.9459  
##  Median : 75   Median :35.20   Median :1.5   Median :14.00   Median :2.3770  
##  Mean   : 75   Mean   :36.56   Mean   :1.5   Mean   :14.11   Mean   :2.3786  
##  3rd Qu.:112   3rd Qu.:43.02   3rd Qu.:2.0   3rd Qu.:16.00   3rd Qu.:2.7485  
##  Max.   :149   Max.   :64.14   Max.   :2.0   Max.   :20.00   Max.   :4.6052  
##       male            white       
##  Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :0.0000   Median :1.0000  
##  Mean   :0.4564   Mean   :0.9362  
##  3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000
# plot for age 
table(age)
## age
## 18.7816600799561 19.2224502563477 19.8987007141113 20.4298400878906 
##                2                2                2                2 
##  21.349760055542 21.7412700653076 21.8042392730713 21.9274501800537 
##                2                2                2                2 
## 22.0725498199463 22.1546897888184 22.4722805023193 23.0006809234619 
##                2                2                2                2 
##  23.682409286499 23.8904895782471 23.9698791503906 24.1177291870117 
##                2                2                2                2 
## 24.6105403900146 24.6242294311523 24.6406593322754 24.7310104370117 
##                2                2                2                2 
## 24.8377799987793 25.1772804260254  25.541410446167 25.9411392211914 
##                2                2                2                2 
## 26.6693992614746 26.9267597198486  27.175910949707 27.2306594848633 
##                2                2                2                2 
## 27.2772102355957 27.4195709228516  27.748119354248    28.1533203125 
##                2                2                2                2 
##  28.323070526123 28.4134197235107 28.5311393737793 28.7364807128906 
##                2                4                2                2 
##  29.054069519043  29.185489654541  29.338809967041 29.3908290863037 
##                2                2                2                2 
## 29.8617401123047 30.1492099761963 30.2258701324463 30.3518104553223 
##                2                2                2                2 
## 30.4503803253174 30.6529808044434 30.8199901580811  30.967830657959 
##                2                2                2                2 
## 31.0307998657227 31.0663890838623 31.2087593078613 31.4578990936279 
##                2                2                2                2 
## 31.7672805786133  31.808349609375 31.8987007141113  32.290210723877 
##                2                2                2                2 
## 32.3477096557617 32.3586616516113 32.4462699890137  32.594108581543 
##                2                2                2                2 
##   32.84326171875 33.0376510620117 33.2512016296387 33.4072608947754 
##                2                2                2                2 
## 33.4620094299316  33.590690612793 33.6399688720703  33.741268157959 
##                2                2                2                2 
## 33.9548301696777 34.6338081359863 34.8665313720703 34.9787788391113 
##                2                2                2                2 
##  35.134838104248 35.2005500793457 35.6084899902344 35.6632499694824 
##                2                2                2                2 
##   36.07666015625 36.3641395568848 36.3723487854004  36.481861114502 
##                2                2                2                2 
## 36.6214904785156 36.9418182373047 36.9445610046387 36.9856300354004 
##                2                2                2                2 
## 37.0239601135254 37.2183418273926 37.2813110351562  37.284049987793 
##                2                2                2                2 
## 37.3744010925293  37.722110748291 37.8398399353027 37.8507881164551 
##                2                2                2                2 
##  38.026008605957 38.5325088500977 38.7707099914551 39.3401794433594 
##                2                2                2                2 
##  39.351131439209 39.6194381713867 40.0492782592773 40.3121109008789 
##                2                2                2                2 
## 40.3586616516113 40.5010299682617 40.8733711242676 41.0841903686523 
##                2                2                2                2 
## 41.3579788208008  42.086238861084 42.1218299865723 42.5489387512207 
##                2                2                2                2 
## 42.7707099914551 42.8501014709473 43.0198516845703 43.2005500793457 
##                2                2                2                2 
## 43.3045806884766 43.4223098754883 43.5701599121094 43.8548889160156 
##                2                2                2                2 
## 44.0191688537598 44.2190284729004 44.2327194213867 44.7063598632812 
##                2                2                2                2 
## 44.8487281799316 45.3935699462891 45.5550994873047 45.6125907897949 
##                2                2                2                2 
## 46.3655014038086 47.6057510375977 47.7426414489746 48.1204605102539 
##                2                2                2                2 
## 48.2491493225098  50.050651550293 50.1492118835449 50.5489387512207 
##                2                2                2                2 
##  51.331958770752 51.8412094116211  51.901439666748 52.9336090087891 
##                2                2                2                2 
## 53.6372299194336 53.6646118164062 53.8535194396973  56.479118347168 
##                2                2                2                2 
## 57.6865196228027 58.1327896118164 61.1581115722656 61.4702301025391 
##                2                2                2                2 
##  61.872688293457 61.9575614929199 63.7179985046387 64.1396331787109 
##                2                2                2                2
fit2 <- lm(lwage ~ poly(age, degree = 2, raw = TRUE), data = data.frame(age, lwage))
fit4 <- lm(lwage ~ poly(age, degree = 4, raw = TRUE), data = data.frame(age, lwage))
age_seq <- seq(min(age), max(age), length.out = 100)
lwage_pred2 <- predict(fit2, newdata = data.frame(age = age_seq))
lwage_pred4 <- predict(fit4, newdata = data.frame(age = age_seq))

plot(0, type = "n", xlim = range(age_seq), ylim = range(c(lwage_pred2, lwage_pred4)),
     xlab = "Age", ylab = "log wage", main = "Life-Cycle Profiles with Polynomial Fits")
lines(age_seq, lwage_pred2, col = "red", lwd = 2, lty = 1, label = "2nd Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
lines(age_seq, lwage_pred4, col = "blue", lwd = 2, lty = 2, label = "4th Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
legend("bottomright", legend = c("2nd Order", "4th Order"), col = c("red", "blue"), lty = 2:3, cex = 0.8)

# plot for educ
table(educ)
## educ
##  8 10 11 12 13 14 15 16 17 18 19 20 
##  1  3  1 95 36 46 24 61  3 19  4  5
fit2 <- lm(lwage ~ poly(educ, degree = 2, raw = TRUE), data = data.frame(educ,lwage))
fit4 <- lm(lwage ~ poly(educ, degree = 4, raw = TRUE), data = data.frame(educ,lwage))
educ_seq <- seq(min(educ), max(educ), length.out = 100)
lwage_pred2 <- predict(fit2, newdata = data.frame(educ = educ_seq))
lwage_pred4 <- predict(fit4, newdata = data.frame(educ = educ_seq))

plot(0, type = "n", xlim = range(educ_seq), ylim = range(c(lwage_pred2, lwage_pred4)),
     xlab = "Years of Education", ylab = "log wage", main = "Education--Wage Profiles with Polynomial Fits")
lines(educ_seq, lwage_pred2, col = "red", lwd = 2, lty = 1, label = "2nd Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
lines(educ_seq, lwage_pred4, col = "blue", lwd = 2, lty = 2, label = "4th Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
legend("bottomright", legend = c("2nd Order", "4th Order"), col = c("red", "blue"), lty = 2:3, cex = 0.8)

# plot for potential experience (usually people force within 0-40 years)
potexp <- pmin(pmax(age - educ - 6,0),40)
table(potexp)
## potexp
##                  0 0.0725498199462891  0.154689788818359  0.429840087890625 
##                  2                  2                  2                  2 
##  0.472280502319336  0.610540390014648  0.682409286499023  0.731010437011719 
##                  1                  2                  1                  2 
##  0.898700714111328   1.22245025634766   1.34976005554199   1.47228050231934 
##                  2                  2                  2                  1 
##   1.68240928649902   1.74127006530762   1.96987915039062   2.11772918701172 
##                  1                  1                  2                  2 
##   2.62422943115234   2.64065933227539   2.74127006530762   2.80423927307129 
##                  1                  1                  1                  2 
##   2.92745018005371   3.00068092346191   3.17728042602539   3.62422943115234 
##                  1                  1                  2                  1 
##   3.89048957824707   3.92745018005371   4.00068092346191   4.64065933227539 
##                  2                  1                  1                  1 
##   4.66939926147461   4.92675971984863   5.17591094970703   5.23065948486328 
##                  1                  1                  2                  1 
##   5.66939926147461   6.32307052612305   6.41341972351074   6.44626998901367 
##                  1                  2                  3                  1 
##   6.54141044616699   6.73648071289062     6.808349609375    6.8377799987793 
##                  1                  2                  1                  2 
##   6.92675971984863       7.1533203125    7.2772102355957   7.54141044616699 
##                  1                  2                  1                  1 
##   7.76728057861328   7.94113922119141   8.22587013244629   8.41341972351074 
##                  1                  2                  1                  1 
##   8.41957092285156    8.5311393737793   8.81999015808105    9.0663890838623 
##                  1                  1                  1                  2 
##   9.14920997619629   9.23065948486328    9.2772102355957   9.33880996704102 
##                  1                  1                  1                  1 
##   9.41957092285156   9.45038032531738   9.45789909362793   9.74811935424805 
##                  1                  1                  1                  2 
##   9.76728057861328   9.81999015808105    10.054069519043   10.2087593078613 
##                  1                  1                  2                  1 
##    10.290210723877   10.3586616516113   10.4503803253174   10.5311393737793 
##                  2                  1                  1                  1 
##   10.6529808044434     10.84326171875   10.8987007141113    11.185489654541 
##                  1                  1                  2                  2 
##   11.2512016296387    11.338809967041   11.3908290863037   11.4462699890137 
##                  2                  1                  2                  1 
##   11.4578990936279    11.590690612793   11.6529808044434    11.741268157959 
##                  1                  1                  1                  1 
##   11.8617401123047   12.0376510620117   12.1492099761963   12.2005500793457 
##                  2                  2                  1                  1 
##   12.2258701324463   12.3518104553223   12.3723487854004    12.590690612793 
##                  1                  2                  2                  1 
##    12.594108581543     12.84326171875    12.967830657959   13.0307998657227 
##                  1                  1                  2                  2 
##   13.2005500793457   13.4072608947754   13.4620094299316    13.741268157959 
##                  1                  1                  1                  1 
##    13.808349609375   13.9548301696777   13.9787788391113   14.3477096557617 
##                  1                  1                  1                  2 
##   14.3586616516113   14.4072608947754   14.6338081359863   14.6632499694824 
##                  1                  1                  2                  1 
##   14.8665313720703   14.9418182373047   14.9445610046387   14.9856300354004 
##                  2                  1                  2                  1 
##   15.4620094299316    15.594108581543   15.6084899902344   15.6194381713867 
##                  1                  1                  2                  1 
##   15.6399688720703    15.722110748291   15.9418182373047   15.9548301696777 
##                  2                  2                  1                  1 
##   15.9787788391113   16.6214904785156   16.8507881164551   16.9856300354004 
##                  1                  1                  1                  1 
##    17.086238861084    17.134838104248   17.2087593078613   17.4223098754883 
##                  1                  2                  1                  1 
##   17.6632499694824   17.8398399353027   17.8548889160156   18.0239601135254 
##                  1                  2                  1                  1 
##     18.07666015625   18.1218299865723   18.2813110351562   18.3641395568848 
##                  2                  2                  1                  2 
##   18.3744010925293    18.481861114502   18.5010299682617   18.5489387512207 
##                  1                  2                  2                  1 
##   18.5701599121094   18.6214904785156   18.7707099914551   18.8507881164551 
##                  1                  1                  1                  1 
##   19.0239601135254   19.2183418273926   19.2813110351562    19.284049987793 
##                  1                  2                  1                  2 
##   19.3045806884766   19.3401794433594   19.3579788208008   19.3744010925293 
##                  1                  1                  1                  1 
##   19.5325088500977   19.7707099914551   20.0191688537598    20.026008605957 
##                  1                  1                  1                  2 
##    20.086238861084   20.2327194213867   20.3121109008789   20.3586616516113 
##                  1                  2                  1                  1 
##   20.5325088500977   20.5489387512207   20.7063598632812   20.7707099914551 
##                  1                  1                  1                  1 
##   20.8501014709473   21.0492782592773   21.0841903686523   21.3045806884766 
##                  2                  2                  1                  1 
##   21.3401794433594    21.351131439209   21.4223098754883   21.6194381713867 
##                  1                  2                  1                  1 
##   22.0191688537598   22.0198516845703   22.0841903686523   22.3121109008789 
##                  1                  1                  1                  1 
##   22.3579788208008   22.3586616516113   22.7707099914551   22.8548889160156 
##                  1                  1                  1                  1 
##   22.8733711242676   23.1204605102539   23.2005500793457   23.7063598632812 
##                  2                  1                  1                  1 
##   25.0198516845703   25.1204605102539   25.2190284729004   25.3935699462891 
##                  1                  1                  1                  2 
##   25.5701599121094   25.6125907897949   25.8487281799316   26.2190284729004 
##                  1                  1                  2                  1 
##   26.9336090087891   27.2005500793457    27.331958770752   27.5550994873047 
##                  2                  1                  2                  2 
##   27.6125907897949   28.3655014038086   29.6057510375977   29.8412094116211 
##                  1                  2                  2                  1 
##   30.2491493225098   31.7426414489746    32.050651550293   32.1492118835449 
##                  2                  2                  2                  2 
##   32.5489387512207   32.6646118164062    32.901439666748   33.6372299194336 
##                  2                  1                  1                  1 
##   33.6646118164062   33.8412094116211    33.901439666748   35.6372299194336 
##                  1                  1                  1                  1 
##   35.8535194396973    37.479118347168    38.479118347168   38.6865196228027 
##                  2                  1                  1                  1 
##   39.1581115722656   39.6865196228027                 40 
##                  1                  1                 13
fit2 <- lm(lwage ~ poly(potexp, degree = 2, raw = TRUE), data = data.frame(potexp,lwage))
fit4 <- lm(lwage ~ poly(potexp, degree = 4, raw = TRUE), data = data.frame(potexp,lwage))
potexp_seq <- seq(min(potexp), max(potexp), length.out = 100)
lwage_pred2 <- predict(fit2, newdata = data.frame(potexp = potexp_seq))
lwage_pred4 <- predict(fit4, newdata = data.frame(potexp = potexp_seq))

plot(0, type = "n", xlim = range(potexp_seq), ylim = range(c(lwage_pred2, lwage_pred4)),
     xlab = "Potential Experience", ylab = "log wage", main = "Life-Cycle Profiles with Polynomial Fits")
lines(potexp_seq, lwage_pred2, col = "red", lwd = 2, lty = 1, label = "2nd Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
lines(potexp_seq, lwage_pred4, col = "blue", lwd = 2, lty = 2, label = "4th Order")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "label" ist kein
## Grafikparameter
legend("bottomright", legend = c("2nd Order", "4th Order"), col = c("red", "blue"), lty = 2:3, cex = 0.8)

# Interpretation
#1. Ashenfelter and Krueger (1994)
# 1a) Research question and difficulties

#nAshenfelter and Krueger study the causal return to schooling, i.e. how much an additional year of education increases wages.
# The main difficulty is that schooling is endogenous: individuals who obtain more education may differ in unobserved ability, motivation, or family background, all of which also affect wages. As a result, simple OLS estimates of the return to education are likely biased due to ability bias, family background effects, and measurement error in reported schooling.

# 1b) Authors’ approach and credibility

# The authors exploit data on identical twins. Since twins share genetics and much of their family environment, comparing wages between twins with different schooling levels differences out many unobserved factors. Their main strategies are:

# Within-family (fixed-effects) estimation

# First-difference estimation

# Use of cross-reported education to address measurement error

# This approach is convincing to a large extent because it controls for major sources of omitted-variable bias. However, it does not fully eliminate concerns about twin-specific differences (e.g. health, motivation, birth order) or remaining measurement error. Overall, the strategy substantially improves upon standard OLS.

# 1c) Key results

# The key result is that the estimated return to schooling is about 7–10% per year, similar to or slightly lower than conventional OLS estimates. This is somewhat surprising, as one might expect OLS to be strongly upward biased. The results suggest that measurement error and ability bias may offset each other, making OLS estimates not wildly misleading. The results are convincing given the careful robustness checks.
######### Question 4.3 #########

#creating variable: age squared
twins_long$age_sq <- (twins_long$age*twins_long$age)

#creating a linear model
regr <- lm(lwage ~ educ + age + age_sq + male + white , data=twins_long)

summary(regr)
## 
## Call:
## lm(formula = lwage ~ educ + age + age_sq + male + white, data = twins_long)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.62602 -0.28748  0.00277  0.28474  2.42317 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.4706147  0.4260210  -1.105 0.270210    
## educ         0.0838714  0.0144261   5.814 1.60e-08 ***
## age          0.0878166  0.0188327   4.663 4.75e-06 ***
## age_sq      -0.0008686  0.0002335  -3.720 0.000239 ***
## male         0.2040259  0.0630223   3.237 0.001345 ** 
## white       -0.4104659  0.1266840  -3.240 0.001333 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5324 on 292 degrees of freedom
## Multiple R-squared:  0.2724, Adjusted R-squared:  0.2599 
## F-statistic: 21.86 on 5 and 292 DF,  p-value: < 2.2e-16
# Interpretation 
# 3a) Life-cycle profile

# The coefficient on age is positive and the coefficient on age² is negative, implying a concave life-cycle earnings profile: wages rise with age at a decreasing rate and eventually peak. This pattern closely matches the age–wage profiles plotted in Question 2.

# 3b) Comparison with Table 3 (Column 1)

# The estimated coefficients closely match Column (1) of Table 3 in Ashenfelter and Krueger. The OLS coefficient on education likely reflects:

# Upward bias from ability and family background

# Downward bias from measurement error in schooling

# The net bias is ambiguous, which helps explain why OLS estimates are similar to fixed-effect estimates.

######### Question 4.4 #########

coeftest(regr, vcov = vcovHC, type = "HC1")
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) -0.47061471  0.49575386 -0.9493 0.3432576    
## educ         0.08387137  0.01473366  5.6925 3.047e-08 ***
## age          0.08781660  0.01976842  4.4423 1.265e-05 ***
## age_sq      -0.00086864  0.00023652 -3.6726 0.0002855 ***
## male         0.20402595  0.06152734  3.3160 0.0010285 ** 
## white       -0.41046590  0.11653213 -3.5223 0.0004962 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coeftest(regr, vcov = vcovHC, cluster = ~ famid)
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) -0.47061471  0.51306812 -0.9173  0.359766    
## educ         0.08387137  0.01497900  5.5993 4.967e-08 ***
## age          0.08781660  0.02033895  4.3177 2.163e-05 ***
## age_sq      -0.00086864  0.00024452 -3.5524  0.000445 ***
## male         0.20402595  0.06213090  3.2838  0.001149 ** 
## white       -0.41046590  0.12500367 -3.2836  0.001149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Interpretation 

# 4. Robust and clustered standard errors

# HC1 robust standard errors account for heteroskedasticity, while clustering at the family level accounts for within-family correlation in errors.
# Clustering typically increases standard errors, reducing statistical significance for some coefficients, but the education coefficient remains statistically significant. This adjustment is appropriate because twins within the same family are not independent observations.

######### Question 4.5 #########
table(educ)
## educ
##  8 10 11 12 13 14 15 16 17 18 19 20 
##  1  3  1 95 36 46 24 61  3 19  4  5
twins_long$educ_factor <- factor(twins_long$educ, levels = c(12, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20)) # 12=High-School will be the omitted category in the regression
table(twins_long$educ,twins_long$educ_factor)
##     
##      12  8 10 11 13 14 15 16 17 18 19 20
##   8   0  1  0  0  0  0  0  0  0  0  0  0
##   10  0  0  3  0  0  0  0  0  0  0  0  0
##   11  0  0  0  1  0  0  0  0  0  0  0  0
##   12 95  0  0  0  0  0  0  0  0  0  0  0
##   13  0  0  0  0 36  0  0  0  0  0  0  0
##   14  0  0  0  0  0 46  0  0  0  0  0  0
##   15  0  0  0  0  0  0 24  0  0  0  0  0
##   16  0  0  0  0  0  0  0 61  0  0  0  0
##   17  0  0  0  0  0  0  0  0  3  0  0  0
##   18  0  0  0  0  0  0  0  0  0 19  0  0
##   19  0  0  0  0  0  0  0  0  0  0  4  0
##   20  0  0  0  0  0  0  0  0  0  0  0  5
regr2 <- lm(lwage ~educ_factor+ age + age_sq + male + white, data=subset(twins_long))
summary(regr2)
## 
## Call:
## lm(formula = lwage ~ educ_factor + age + age_sq + male + white, 
##     data = subset(twins_long))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.60043 -0.29552 -0.00838  0.26326  2.51836 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.3647119  0.3987379   0.915 0.361149    
## educ_factor8   0.1776058  0.5535310   0.321 0.748554    
## educ_factor10 -0.3184596  0.3161341  -1.007 0.314627    
## educ_factor11  0.1898938  0.5400986   0.352 0.725408    
## educ_factor13  0.2910925  0.1074009   2.710 0.007133 ** 
## educ_factor14  0.2896466  0.0976197   2.967 0.003264 ** 
## educ_factor15  0.2938105  0.1227619   2.393 0.017350 *  
## educ_factor16  0.3610306  0.0896191   4.029 7.22e-05 ***
## educ_factor17  0.5328049  0.3151039   1.691 0.091963 .  
## educ_factor18  0.4776445  0.1362620   3.505 0.000530 ***
## educ_factor19  0.8128246  0.2739847   2.967 0.003268 ** 
## educ_factor20  0.8080972  0.2479446   3.259 0.001254 ** 
## age            0.0942462  0.0195404   4.823 2.32e-06 ***
## age_sq        -0.0009432  0.0002415  -3.906 0.000118 ***
## male           0.1954990  0.0649259   3.011 0.002839 ** 
## white         -0.4220351  0.1324176  -3.187 0.001598 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5352 on 282 degrees of freedom
## Multiple R-squared:   0.29,  Adjusted R-squared:  0.2522 
## F-statistic: 7.678 on 15 and 282 DF,  p-value: 2.272e-14
twins_long$coefficients <- coef(regr2)[twins_long$educ_factor]

twins_long$coefficients[twins_long$educ_factor == 12] <- 0

plot(twins_long$educ, twins_long$coefficients, xlim = range(educ),
xlab = "Years of Education", ylab = "log wage relative to HS=12", main = "Conditional Education--Wage Profiles 
(when Regression-Adjusting)", cex.lab=0.75, cex.main=0.85)

abline(lm(twins_long$coefficients ~ educ),col='red', lwd = 2, lty = 2)

# Interpretation 
 # Education dummies

#Replacing years of education with education-level dummies (omitting high school, educ = 12) shows that:

# Returns are approximately linear

# Some non-linearities exist at very high education levels (college and beyond)

# Overall, the pattern supports a roughly linear return to education, validating the linear specification as a reasonable approximation.

######### Question 4.6 #########

regr3 <- lm(lwage ~ educ + factor(famid), data = twins_long)

regr3 <- lm(lwage ~ educ + factor(famid) + factor(twin), data = twins_long)

summary(regr3)
## 
## Call:
## lm(formula = lwage ~ educ + factor(famid) + factor(twin), data = twins_long)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.016 -0.133  0.000  0.133  1.016 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.786296   0.471373   1.668 0.097425 .  
## educ              0.091569   0.023711   3.862 0.000168 ***
## factor(famid)2    0.284803   0.392052   0.726 0.468723    
## factor(famid)3    0.873150   0.403187   2.166 0.031953 *  
## factor(famid)4    0.436302   0.394732   1.105 0.270830    
## factor(famid)5    0.686161   0.394732   1.738 0.084252 .  
## factor(famid)6    0.580491   0.398276   1.458 0.147109    
## factor(famid)7    0.143165   0.403187   0.355 0.723037    
## factor(famid)8    0.851733   0.400564   2.126 0.035146 *  
## factor(famid)9    0.704329   0.403187   1.747 0.082742 .  
## factor(famid)10   0.510042   0.394732   1.292 0.198342    
## factor(famid)11   0.109504   0.398276   0.275 0.783745    
## factor(famid)12  -1.542640   0.391873  -3.937 0.000127 ***
## factor(famid)13   0.377758   0.398276   0.948 0.344440    
## factor(famid)14   0.577055   0.400564   1.441 0.151822    
## factor(famid)15   0.287605   0.392052   0.734 0.464367    
## factor(famid)16   0.577228   0.396331   1.456 0.147406    
## factor(famid)17  -0.073043   0.394732  -0.185 0.853449    
## factor(famid)18   0.625514   0.403187   1.551 0.122950    
## factor(famid)19   0.037265   0.392589   0.095 0.924507    
## factor(famid)20  -0.040834   0.403187  -0.101 0.919468    
## factor(famid)21   0.374854   0.393483   0.953 0.342329    
## factor(famid)22   0.692471   0.403187   1.717 0.087995 .  
## factor(famid)23   0.070782   0.391873   0.181 0.856911    
## factor(famid)24   0.471198   0.400564   1.176 0.241362    
## factor(famid)25   0.704329   0.403187   1.747 0.082742 .  
## factor(famid)26   0.517283   0.392589   1.318 0.189681    
## factor(famid)27   0.780732   0.392589   1.989 0.048593 *  
## factor(famid)28   0.123463   0.391873   0.315 0.753164    
## factor(famid)29   0.647664   0.403187   1.606 0.110340    
## factor(famid)30   0.443953   0.398276   1.115 0.266804    
## factor(famid)31   0.363248   0.391873   0.927 0.355470    
## factor(famid)32  -0.051435   0.403187  -0.128 0.898663    
## factor(famid)33   0.277613   0.392589   0.707 0.480603    
## factor(famid)34   0.955651   0.394732   2.421 0.016696 *  
## factor(famid)35  -0.812751   0.391873  -2.074 0.039822 *  
## factor(famid)36   1.027938   0.398276   2.581 0.010831 *  
## factor(famid)37   2.157194   0.393483   5.482 1.78e-07 ***
## factor(famid)38   0.384228   0.392052   0.980 0.328675    
## factor(famid)39   0.541602   0.394732   1.372 0.172129    
## factor(famid)40  -0.176165   0.403187  -0.437 0.662802    
## factor(famid)41   0.940748   0.403187   2.333 0.020989 *  
## factor(famid)42   0.005130   0.394732   0.013 0.989649    
## factor(famid)43   0.373865   0.394732   0.947 0.345123    
## factor(famid)44   0.258201   0.403187   0.640 0.522910    
## factor(famid)45   0.662423   0.394732   1.678 0.095441 .  
## factor(famid)46   0.847492   0.393483   2.154 0.032885 *  
## factor(famid)47   0.520621   0.396331   1.314 0.191027    
## factor(famid)48  -0.224229   0.398276  -0.563 0.574294    
## factor(famid)49  -1.032630   0.394732  -2.616 0.009824 ** 
## factor(famid)50   0.194212   0.391873   0.496 0.620916    
## factor(famid)51   0.539896   0.400564   1.348 0.179783    
## factor(famid)52   0.395290   0.396331   0.997 0.320221    
## factor(famid)53   0.684363   0.394732   1.734 0.085060 .  
## factor(famid)54   0.459628   0.392589   1.171 0.243590    
## factor(famid)55  -0.336669   0.398276  -0.845 0.399309    
## factor(famid)56   0.473476   0.403187   1.174 0.242160    
## factor(famid)57  -0.527922   0.394732  -1.337 0.183151    
## factor(famid)58   1.024986   0.392589   2.611 0.009968 ** 
## factor(famid)59  -0.064971   0.396331  -0.164 0.870011    
## factor(famid)60   0.184093   0.392589   0.469 0.639821    
## factor(famid)61  -1.027830   0.391873  -2.623 0.009638 ** 
## factor(famid)62   0.432291   0.392052   1.103 0.271987    
## factor(famid)63   1.107019   0.391873   2.825 0.005386 ** 
## factor(famid)64  -0.605801   0.398276  -1.521 0.130394    
## factor(famid)65   0.463140   0.392052   1.181 0.239382    
## factor(famid)66  -0.067488   0.403187  -0.167 0.867296    
## factor(famid)67   0.311401   0.403187   0.772 0.441148    
## factor(famid)68  -0.343842   0.392052  -0.877 0.381901    
## factor(famid)69   0.291112   0.392052   0.743 0.458948    
## factor(famid)70   1.140370   0.403187   2.828 0.005332 ** 
## factor(famid)71   0.047763   0.391873   0.122 0.903158    
## factor(famid)72  -0.186777   0.398276  -0.469 0.639790    
## factor(famid)73  -0.236218   0.392052  -0.603 0.547758    
## factor(famid)74   0.837028   0.398276   2.102 0.037292 *  
## factor(famid)75   0.385610   0.403187   0.956 0.340437    
## factor(famid)76   0.353982   0.396331   0.893 0.373237    
## factor(famid)77   0.260816   0.393483   0.663 0.508472    
## factor(famid)78   0.253920   0.394732   0.643 0.521050    
## factor(famid)79  -0.411142   0.394732  -1.042 0.299319    
## factor(famid)80   0.111484   0.394732   0.282 0.778011    
## factor(famid)81   0.050241   0.403187   0.125 0.901003    
## factor(famid)82   0.528519   0.392589   1.346 0.180298    
## factor(famid)83   0.105553   0.403187   0.262 0.793845    
## factor(famid)84  -0.321969   0.392052  -0.821 0.412840    
## factor(famid)85   0.241221   0.400564   0.602 0.547966    
## factor(famid)86   0.474020   0.391873   1.210 0.228363    
## factor(famid)87  -0.079979   0.403187  -0.198 0.843032    
## factor(famid)88   2.007199   0.394732   5.085 1.10e-06 ***
## factor(famid)89  -0.060573   0.392589  -0.154 0.877593    
## factor(famid)90   0.381172   0.400564   0.952 0.342868    
## factor(famid)91  -0.498119   0.394732  -1.262 0.208977    
## factor(famid)92  -0.377980   0.391873  -0.965 0.336355    
## factor(famid)93  -0.211121   0.396331  -0.533 0.595053    
## factor(famid)94   0.723924   0.394732   1.834 0.068681 .  
## factor(famid)95  -0.452334   0.396331  -1.141 0.255599    
## factor(famid)96   0.868657   0.392589   2.213 0.028465 *  
## factor(famid)97   0.632355   0.392589   1.611 0.109384    
## factor(famid)98   0.091748   0.400564   0.229 0.819150    
## factor(famid)99   0.249227   0.416898   0.598 0.550885    
## factor(famid)100  0.725133   0.394732   1.837 0.068225 .  
## factor(famid)101  0.411200   0.392052   1.049 0.295973    
## factor(famid)102  0.252883   0.396331   0.638 0.524427    
## factor(famid)103 -0.004189   0.394732  -0.011 0.991548    
## factor(famid)104  0.254737   0.403187   0.632 0.528494    
## factor(famid)105  0.708860   0.393483   1.801 0.073674 .  
## factor(famid)106  0.100320   0.396331   0.253 0.800527    
## factor(famid)107  0.101126   0.403187   0.251 0.802307    
## factor(famid)108 -0.140827   0.403187  -0.349 0.727375    
## factor(famid)109 -0.003312   0.398276  -0.008 0.993377    
## factor(famid)110  0.006086   0.398276   0.015 0.987828    
## factor(famid)111  0.733806   0.394732   1.859 0.065026 .  
## factor(famid)112  1.071615   0.392589   2.730 0.007116 ** 
## factor(famid)113 -0.271375   0.393483  -0.690 0.491487    
## factor(famid)114 -0.432740   0.392052  -1.104 0.271492    
## factor(famid)115  0.254328   0.398276   0.639 0.524095    
## factor(famid)116  0.441049   0.392589   1.123 0.263084    
## factor(famid)117  0.013053   0.403187   0.032 0.974217    
## factor(famid)118  0.206725   0.400564   0.516 0.606571    
## factor(famid)119  0.719739   0.394732   1.823 0.070280 .  
## factor(famid)120  0.265969   0.403187   0.660 0.510500    
## factor(famid)121  0.031592   0.403187   0.078 0.937651    
## factor(famid)122 -0.151560   0.391873  -0.387 0.699494    
## factor(famid)123  0.416948   0.392589   1.062 0.289957    
## factor(famid)124  0.036694   0.392589   0.093 0.925660    
## factor(famid)125  0.263411   0.403187   0.653 0.514570    
## factor(famid)126  0.850042   0.403187   2.108 0.036701 *  
## factor(famid)127  0.376795   0.406139   0.928 0.355059    
## factor(famid)128  0.247300   0.392589   0.630 0.529724    
## factor(famid)129 -0.798500   0.400564  -1.993 0.048064 *  
## factor(famid)130  1.071313   0.403187   2.657 0.008753 ** 
## factor(famid)131 -0.448921   0.403187  -1.113 0.267340    
## factor(famid)132  0.005419   0.400564   0.014 0.989225    
## factor(famid)133  0.496774   0.398276   1.247 0.214267    
## factor(famid)134  0.035385   0.394732   0.090 0.928693    
## factor(famid)135  0.612190   0.400564   1.528 0.128582    
## factor(famid)136 -0.216924   0.400564  -0.542 0.588950    
## factor(famid)137 -0.103905   0.391873  -0.265 0.791265    
## factor(famid)138 -0.283914   0.394732  -0.719 0.473124    
## factor(famid)139  0.405892   0.391873   1.036 0.302008    
## factor(famid)140  0.236819   0.403187   0.587 0.557858    
## factor(famid)141 -0.569069   0.398276  -1.429 0.155175    
## factor(famid)142  0.441675   0.398276   1.109 0.269256    
## factor(famid)143  0.903498   0.398276   2.269 0.024754 *  
## factor(famid)144 -0.466847   0.403187  -1.158 0.248785    
## factor(famid)145  1.033556   0.396331   2.608 0.010052 *  
## factor(famid)146  1.046223   0.403187   2.595 0.010421 *  
## factor(famid)147 -0.736406   0.398276  -1.849 0.066469 .  
## factor(famid)148  0.180535   0.392589   0.460 0.646298    
## factor(famid)149  0.156918   0.391873   0.400 0.689420    
## factor(twin)2     0.078593   0.045472   1.728 0.086023 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3919 on 147 degrees of freedom
## Multiple R-squared:  0.8015, Adjusted R-squared:  0.599 
## F-statistic: 3.958 on 150 and 147 DF,  p-value: 4.058e-16
# interpretation 
# Family fixed effects vs. OLS

# When adding family fixed effects, identification comes from differences in schooling within twin pairs. The education coefficient typically falls slightly relative to OLS.
# Economically, this reflects the removal of:

# Shared family background

# Genetic ability

# The remaining variation is arguably closer to the true causal effect of education.
######### Question 4.7 #########

#read in the wide data
twins <- read_dta("twins.dta")

#differences in education
twins$dif_edu <- (twins$educ1-twins$educ2)


#differences in wages
twins$dif_lwage <- (twins$lwage1-twins$lwage2)

#linear model for first differences
first_dif_regr <- lm(dif_lwage ~ dif_edu,  data = twins)


summary(first_dif_regr)
## 
## Call:
## lm(formula = dif_lwage ~ dif_edu, data = twins)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.03115 -0.20909  0.00722  0.34395  1.15740 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.07859    0.04547  -1.728 0.086023 .  
## dif_edu      0.09157    0.02371   3.862 0.000168 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5542 on 147 degrees of freedom
## Multiple R-squared:  0.09211,    Adjusted R-squared:  0.08593 
## F-statistic: 14.91 on 1 and 147 DF,  p-value: 0.0001682
# Interpretation
# The first-difference estimate is similar to the family fixed-effect estimate and close to Column (v) of Table 3.
# In theory, fixed effects and first differences should produce the same coefficient when there are exactly two observations per family. In practice, small differences arise due to sample restrictions and measurement error, but the results are very close, reinforcing the credibility of the findings.

# Overall conclusion

# Ashenfelter and Krueger provide strong evidence that the return to education is robust to controlling for family background and unobserved ability. Twin-based methods substantially strengthen causal interpretation, and the results suggest that conventional OLS estimates are not severely biased.