library(MASS)
library(tidyverse)
library(GGally)
library(labelled)
library(dplyr)
library(rstatix)
library(ivreg)
library(simstudy)
library(ivmodel)
library(ggplot2)
library(car)
library(lmtest)
library(tseries)
library(ggfortify)
library(plotly)
library(xtable)
library(stargazer)

setwd("/Users/andreasluck/Desktop/Labor-Abgaben")

rm(list = ls())


####### Task 1.a.) ####### 
# Generate log wages (ln wage) and interpret a table with descriptive statistics for Education, Motivation, 
# hours and ln wage. Also calculate correlations and plot the density of wages as well as a histogram 
# for the years of education.

load("ps1_clean_data.Rda") 

#a)
# df1 stands for dataframe 1. generate the log of wage.
df1$ln_wage = log(df1$wage)
table(df1$wage_premium)

#Drop the individuals with wage_premium.
#df1 <- subset(df1, wage_premium != 1)

#b)

des_summary <-get_summary_stats(
  df1,
  education, motivation, hours, ln_wage,  # columns to calculate for
  type = "common") 
LATEX_summary <- xtable(des_summary)
print(LATEX_summary, type="latex", include.rownames = FALSE)


#Get correlations
correlation <-cor(df1[,c(1,2,5,6)], use="pairwise", method="pearson")
LATEX_cor <- xtable(correlation)
print(LATEX_cor, type="latex", include.rownames = FALSE)

#Graph
density <- density(df1$ln_wage)
fig <- plot_ly(x = ~density$x, y = ~density$y, type = 'scatter', mode = 'lines', fill = 'tozeroy')
fig <- fig %>% layout(xaxis = list(title = 'Log Wage'),
                      yaxis = list(title = 'Density'))
fig

fig2 <- plot_ly(x = ~df1$education, type = "histogram",alpha = 0.6)%>% layout(bargap=0.1,xaxis =list(dtick = 1,tickmode = "linear"))
fig2 <- fig2 %>% layout(xaxis = list(title = 'Years of education'),
                        yaxis = list(title = 'Frequency'))
fig2

#b)
#Create dummy

df1$M <- ifelse(df1$motivation >=0, 1, 0)

#descriptive Motivation
summaryB <-df1 %>% 
  group_by(M) %>%
  get_summary_stats(
    education, motivation, hours, ln_wage,  # columns to calculate for
    type = "common") 

LATEX_summaryB <- xtable(summaryB)
print(LATEX_summaryB, type="latex", include.rownames = FALSE)

#descriptive education
df1$E <- ifelse(df1$education >=14, 1, 0)

summaryBB <- df1 %>% 
  group_by(E) %>%
  get_summary_stats(
    education, motivation, hours, ln_wage,  # columns to calculate for
    type = "common") 

LATEX_summaryBB <- xtable(summaryBB)
print(LATEX_summaryBB, type="latex", include.rownames = FALSE)

#c)
plot_ly(data=df1, x = ~ln_wage, y = ~hours, name = 'Observations', type = 'scatter',mode = 'markers')

####### Regressions ####### 

#d)
#Bivariate Regression
simple_OLS <- lm(hours ~ ln_wage,data = df1)
summary(simple_OLS)

#Confidence intervals 
conf_interval <- confint(simple_OLS)
Std_error_wage <- sqrt(diag(vcov(simple_OLS)))[2]
c("lower (2.5%)" = simple_OLS$coef[2] - qt(0.975, df = simple_OLS$df) * Std_error_wage,
  "upper (97.5%)" = simple_OLS$coef[2] + qt(0.975, df = simple_OLS$df) * Std_error_wage)

df1$hours_hat = fitted(simple_OLS) 
df1$res = simple_OLS$residuals

#Plot of x, y and the regression line
plot_ly(data = df1, x = ~ln_wage, y = ~hours, name = 'Observations', type = 'scatter', mode = 'markers') %>%
  add_trace(data = df1, y = ~hours_hat, name = 'OLS', mode = 'lines') %>%
  add_ribbons(data = df1, ymin = ~predict(simple_OLS, interval = "confidence")[, "lwr"],
              ymax = ~predict(simple_OLS, interval = "confidence")[, "upr"],
              name = "Confidence 95%",
              line = list(color = 'transparent'), 
              fillcolor = 'rgba(0,100,80,0.2)',
              showlegend = TRUE)

#e)
#Adding education to the regression
simple_OLS_education <-lm(hours ~ ln_wage + education,data = df1)
summary(simple_OLS_education)
# Regressionsergebnisse als LaTeX ausgeben
stargazer(simple_OLS_education, type = "latex", title = "simple OLS education", 
          dep.var.labels = "hours", 
          covariate.labels = c("lnwage", "education"),
          out = "regression_table.tex") 

#f)
# Reg with all variables
simple_OLS_education_motivation <- lm(hours ~ ln_wage + education + motivation ,data = df1)
summary(simple_OLS_education_motivation)
# Regressionsergebnisse als LaTeX ausgeben
stargazer(simple_OLS_education_motivation, type = "latex", title = "simple OLS motivation", 
          dep.var.labels = "hours", 
          covariate.labels = c("lnwage","education", "motivation"),
          out = "regression_table.tex")