library(MASS)
library(tidyverse)
library(GGally)
library(labelled)
library(dplyr)
library(rstatix)
library(ivreg)
library(simstudy)
library(ivmodel)
library(ggplot2)
library(car)
library(lmtest)
library(tseries)
library(ggfortify)
library(plotly)

setwd("/Users/michaelboehm/sciebo/EWF/Lehre/Labour-Economics/WS2025-26/assignments/Assignment1--Labour Supply Simulated")
#setwd("/Users/maxjanhofer/sciebo - Janhofer, Max (max.janhofer@tu-dortmund.de)@tu-dortmund.sciebo.de/Labour-Economics/WS2025-26/assignments/Assignment1--Labour Supply Simulated")

rm(list = ls())


####### Task 1.a.) ####### 
# Generate log wages (ln wage) and interpret a table with descriptive statistics for Education, Motivation, 
# hours and ln wage. Also calculate correlations and plot the density of wages as well as a histogram 
# for the years of education.

load("ps1_clean_data.Rda") 

#a)
# df1 stands for dataframe 1. generate the log of wage.
df1$ln_wage = log(df1$wage)
table(df1$wage_premium)

#Get summary statistics: (red font for negative values)
df1 %>% 
  get_summary_stats(
    education, motivation, hours, ln_wage,  # columns to calculate for
    type = "common") 

#alternative to piping operator %>% 
get_summary_stats(
  df1,
  education, motivation, hours, ln_wage,  # columns to calculate for
  type = "common") 

#Alternative:
summary(df1$motivation)
summary(df1$education)
summary(df1$hours)
summary(df1$ln_wage)

#Get correlations
round(cor(df1[,c(1,2,5,6)], use="pairwise", method="pearson"), digits = 4)

#Some Graphs 
density <- density(df1$ln_wage)
fig <- plot_ly(x = ~density$x, y = ~density$y, type = 'scatter', mode = 'lines', fill = 'tozeroy')
fig <- fig %>% layout(xaxis = list(title = 'Log Wage'),
                      yaxis = list(title = 'Density'))
fig

fig2 <- plot_ly(x = ~df1$education, type = "histogram",alpha = 0.6)%>% layout(bargap=0.1,xaxis =list(dtick = 1,tickmode = "linear"))
fig2 <- fig2 %>% layout(xaxis = list(title = 'Years of education'),
                      yaxis = list(title = 'Frequency'))
fig2
#If you want to export your graphs as .pdf:
#use the export button in your Viewer, then go to save as image and save it.
#Then you have to use an pdf writer such as adobe pdf writer and create an pdf with another file, here we want this file to be our image we safed earlier.
#then you can use this .pdf for example to paste it into your LaTex document.

#b)
#Get summary statistics depending on M:

#Create dummy
df1$M <- ifelse(df1$motivation >=0, 1, 0)

df1 %>% 
  group_by(M) %>%
  get_summary_stats(
    education, motivation, hours, ln_wage,  # columns to calculate for
    type = "common") 

#Alternative:
#Motivation below zero
summary(df1$education[df1$M==0])
summary(df1$hours[df1$M==0])
summary(df1$ln_wage[df1$M==0])

#Motivation above or equal to zero
summary(df1$education[df1$M==1])
summary(df1$hours[df1$M==1])
summary(df1$ln_wage[df1$M==1])

df1$E <- ifelse(df1$education >=14, 1, 0)

df1 %>% 
  group_by(E) %>%
  get_summary_stats(
    education, motivation, hours, ln_wage,  # columns to calculate for
    type = "common") 

#c)
plot_ly(data=df1, x = ~ln_wage, y = ~hours, name = 'Observations', type = 'scatter',mode = 'markers')

####### Regressions ####### 

#d)
#Bivariate Regression
simple_OLS <- lm(hours ~ ln_wage,data = df1)
summary(simple_OLS)

#Confidence intervals 
conf_interval <- confint(simple_OLS)
Std_error_wage <- sqrt(diag(vcov(simple_OLS)))[2]
c("lower (2.5%)" = simple_OLS$coef[2] - qt(0.975, df = simple_OLS$df) * Std_error_wage,
  "upper (97.5%)" = simple_OLS$coef[2] + qt(0.975, df = simple_OLS$df) * Std_error_wage)


df1$hours_hat = fitted(simple_OLS) 
df1$res = simple_OLS$residuals

#Plot of x, y and the regression line
  plot_ly(data = df1, x = ~ln_wage, y = ~hours, name = 'Observations', type = 'scatter', mode = 'markers') %>%
    add_trace(data = df1, y = ~hours_hat, name = 'OLS', mode = 'lines') %>%
    add_ribbons(data = df1, ymin = ~predict(simple_OLS, interval = "confidence")[, "lwr"],
                ymax = ~predict(simple_OLS, interval = "confidence")[, "upr"],
                name = "Confidence 95%",
                line = list(color = 'transparent'), 
                fillcolor = 'rgba(0,100,80,0.2)',
                showlegend = TRUE)
#e)
#Adding education to the regression
simple_OLS_education <-lm(hours ~ ln_wage + education,data = df1)
summary(simple_OLS_education)

#f)
# Reg with all variables
simple_OLS_education_motivation <- lm(hours ~ ln_wage + education + motivation ,data = df1)
summary(simple_OLS_education_motivation)