library(MASS)
library(tidyverse)
library(GGally)
library(labelled)
library(dplyr)
library(rstatix)
library(ivreg)
library(simstudy)
library(ivmodel)
library(ggplot2)
library(car)
library(lmtest)
library(tseries)
library(ggfortify)
library(plotly)
library(stargazer)
library(cobalt)

setwd("C:/Users/KaiHi/Documents/Uni/25_26-WS/Labor Econ/Problem_sets/PS2")

rm(list = ls())


####### Tasks ####### 
load("ps1_more_realistic_data.Rda") 

#a)
#Get summary statistics:
df2 %>% 
  get_summary_stats(
    education, hours_ME,wage_premium,  # columns to calculate for
    type = "common") 

cor(df2[,c(1,3,4)], use="pairwise", method="pearson")

#plots from 1.1 d)
density <- density(df2$ln_wage_ME)
fig <- plot_ly(x = ~density$x, y = ~density$y, type = 'scatter', mode = 'lines', fill = 'tozeroy')
fig <- fig %>% layout(xaxis = list(title = 'Log Wage'),
                      yaxis = list(title = 'Density'))
fig

plot_ly(data=df2, x = ~ln_wage_ME, y = ~density, name = 'Observations', type = 'scatter',mode = 'markers')

#b) Reg with Measurement Error (insignificant results)
OLS_ME <- lm(hours_ME ~ ln_wage_ME,data = df2) 
summary(OLS_ME)

# export table
stargazer(OLS_ME, 
          title = "Bivariate Regression: Hours Worked on Log Wage",
          label = "tab:regression",
          dep.var.labels = "Hours Worked",
          covariate.labels = "Log Wage",
          ci = TRUE,
          single.row = TRUE,
          header = FALSE)



## for comparison, also load the previous data
load("ps1_clean_data.Rda")
df1$ln_wage = log(df1$wage)
summary(lm(hours ~ ln_wage,data = df1))
# -> note that this is upward-biased due to OVB, as we are missing control variables

#c)

t.test(education ~ wage_premium, var.equal = TRUE, data = df2)

#d)

t.test(education ~ wage_premium, var.equal = TRUE, data = df1)

bal.tab(motivation ~ wage_premium, data = df1,un = TRUE, thresholds = c(m = 0.1))


#e)
first_stage = lm(ln_wage_ME ~ wage_premium,data = df2)
summary(first_stage)


# export table
stargazer(first_stage, 
          title = "first stage",
          label = "tab:regression",
          dep.var.labels = "Log Wage",
          covariate.labels = "wage premium",
          ci = TRUE,
          single.row = TRUE,
          header = FALSE)


ivmodel1 <- ivreg(hours_ME ~ ln_wage_ME | wage_premium, data = df2)
summary(ivmodel1)

# export table
stargazer(ivmodel1, 
          title = "IV",
          label = "tab:regression",
          dep.var.labels = "hours worked",
          covariate.labels = "log wage",
          ci = TRUE,
          single.row = TRUE,
          header = FALSE)




#f) some text

#g) 
first_stage = lm(ln_wage_ME ~ wage_premium+education,data = df2)
summary(first_stage)

ivmodel2 <- ivreg(hours_ME ~ ln_wage_ME+education | wage_premium+education, data = df2)
summary(ivmodel2)

stargazer(first_stage, ivmodel2,
          title = "First Stage and IV Model Results",
          label = "tab:regression",
          header = FALSE,
          column.labels = c("First Stage (OLS)", "IV Model (2SLS)"),
          dep.var.labels = c("Log Wage", "Hours Worked"),
          covariate.labels = c("Wage Premium (Instrument)", "Log Wage", "Education"),
          ci = TRUE,
          single.row = TRUE
)

#h)
# Reg with all variables
summary(lm(hours ~ ln_wage + education + motivation ,data = df1))

coefficients <- coef(lm(hours ~ ln_wage + education + motivation, data = df1))

# mean of the outcome variable
average_hours <- mean(df1$hours)

# Average effect 
average_effect_ln_wage <- coefficients['ln_wage'] / average_hours
average_effect_education <- coefficients['education'] / average_hours
average_effect_motivation <- coefficients['motivation'] / average_hours

# print effects
cat("Average effect of ln_wage:",average_effect_ln_wage, "\n")
cat("Average effect of education:", average_effect_education, "\n")
cat("Average effect of motivation:", average_effect_motivation, "\n")