library(MASS)
library(tidyverse)
library(GGally)
library(labelled)
library(dplyr)
library(rstatix)
library(ivreg)
library(simstudy)
library(ivmodel)
library(ggplot2)
library(car)
library(lmtest)
library(tseries)
library(ggfortify)
library(plotly)


#setwd("____")

rm(list = ls())


####### Tasks ####### 
#load("ps1_more_realistic_data.Rda") 

#a)
#Get summary statistics:
df2 %>% 
  get_summary_stats(
    education, hours_ME, ln_wage_ME,  # columns to calculate for
    type = "common") 

#plots from 1.1 d)
density <- density(df2$ln_wage_ME)
fig <- plot_ly(x = ~density$x, y = ~density$y, type = 'scatter', mode = 'lines', fill = 'tozeroy')
fig <- fig %>% layout(xaxis = list(title = 'Log Wage'),
                      yaxis = list(title = 'Density'))
fig

plot_ly(data=df2, x = ~ln_wage_ME, y = ~hours_ME, name = 'Observations', type = 'scatter',mode = 'markers')

#b) Reg with Measurement Error (insignificant results)
summary(lm(hours_ME ~ ln_wage_ME,data = df2))

## for comparison, also load the previous data
#load("ps1_clean_data.Rda")
df1$ln_wage = log(df1$wage)
summary(lm(hours ~ ln_wage,data = df1))
# -> note that this is upward-biased due to OVB, as we are missing control variables

#c)

t.test(ln_wage_ME ~ wage_premium, var.equal = TRUE, data = df2)

#d)

t.test(motivation ~ wage_premium, var.equal = TRUE, data = df1)

#e)
first_stage = lm(ln_wage_ME ~ wage_premium,data = df2)
summary(first_stage)

ivmodel1 <- ivreg(hours_ME ~ ln_wage_ME | wage_premium, data = df2)
summary(ivmodel1)

#f) some text

#g) 
first_stage = lm(ln_wage_ME ~ wage_premium+education,data = df2)
summary(first_stage)

ivmodel2 <- ivreg(hours_ME ~ ln_wage_ME+education | wage_premium+education, data = df2)
summary(ivmodel2)

#h)
# Reg with all variables
summary(lm(hours ~ ln_wage + education + motivation ,data = df1))

coefficients <- coef(lm(hours ~ ln_wage + education + motivation, data = df1))

# mean of the outcome variable
average_hours <- mean(df1$hours)

# Average effect 
average_effect_ln_wage <- coefficients['ln_wage'] / average_hours
average_effect_education <- coefficients['education'] / average_hours
average_effect_motivation <- coefficients['motivation'] / average_hours

# print effects
cat("Average effect of ln_wage:", average_effect_ln_wage, "\n")
cat("Average effect of education:", average_effect_education, "\n")
cat("Average effect of motivation:", average_effect_motivation, "\n")