###############################################################
#Labour Economics PROBLEM SET 1 – DATA ANALYSIS IN R
#Tasks (a)–(f): Using ps1_clean_data.Rda
# Author: Bademba Drammeh
# Date:
###############################################################
# --- SETUP ---
# The workspace is deliberately NOT wiped with rm(list = ls()): that call only
# removes objects (attached packages and options survive), so it does not give
# a clean session, and it destroys the workspace of anyone who source()s this
# file. Restart R if a fresh session is needed.

# Load required libraries
#library(psych)       # For descriptive stats
#library(ggplot2)     # For plotting
#library(stargazer)   # For regression tables (optional)
#library(tinytex)
# --- LOAD DATA ---
# 1. Locate the data folder (not the file) without calling setwd(), so the
#    script never changes the caller's working directory. Set the environment
#    variable PS1_DATA_DIR to use a different folder.
#    Location used previously:
#    "C:/Users/bdrammeh/3D Objects/Desktop/winter25-26/Labour Economics/Assginments"
data_dir <- Sys.getenv(
  "PS1_DATA_DIR",
  unset = "C:/Users/bdrammeh/Desktop/Labour Economics/assignmnt"
)
data_file <- file.path(data_dir, "ps1_clean_data.Rda")
if (!file.exists(data_file)) {
  # Fall back to the current working directory.
  data_file <- "ps1_clean_data.Rda"
}
if (!file.exists(data_file)) {
  stop(
    "Cannot find 'ps1_clean_data.Rda' in '", data_dir,
    "' or in the working directory '", getwd(), "'.",
    call. = FALSE
  )
}

# load() returns (invisibly) the names of the objects it restored.
loaded_objects <- load(data_file)

# Check which object was loaded
print(loaded_objects)
## [1] "df1"
# If the dataset isn't called df1, rename it accordingly
# Example: if "ps1_clean_data" is printed above
#df1 <- ps1_clean_data

# Everything below works on a data frame named df1; fail early with a clear
# message instead of a confusing error further down.
if (!exists("df1") || !is.data.frame(df1)) {
  stop(
    "Expected a data frame named 'df1'; the file provided: ",
    paste(loaded_objects, collapse = ", "),
    call. = FALSE
  )
}
# --- (a) Generate log of wage and descriptive statistics ---
# Drop missing or invalid wages BEFORE taking logs. Taking log() first would
# emit a "NaNs produced" warning for negative wages and create -Inf for zero
# wages, only for those rows to be removed afterwards. The retained rows are
# the same as filtering on is.finite(ln_wage) & wage > 0.
valid_wage <- is.finite(df1$wage) & df1$wage > 0
df1 <- df1[valid_wage, , drop = FALSE]
df1$ln_wage <- log(df1$wage)

# Descriptive statistics
#install.packages("psych")
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
cat("### Descriptive Statistics\n")
## ### Descriptive Statistics
# Printed explicitly: top-level values are not auto-printed when the script is
# run through source().
print(describe(df1[, c("education", "motivation", "hours", "ln_wage")]))
##            vars    n  mean   sd median trimmed  mad   min   max range skew
## education     1 5000 12.77 2.42  12.00   12.52 2.97 10.00 24.00 14.00 0.67
## motivation    2 5000  0.01 1.39   0.00    0.01 1.39 -4.56  5.14  9.71 0.00
## hours         3 5000  7.95 0.49   7.95    7.95 0.49  6.35  9.70  3.35 0.13
## ln_wage       4 5000  3.36 0.23   3.35    3.36 0.23  2.37  4.12  1.75 0.00
##            kurtosis   se
## education     -0.15 0.03
## motivation    -0.10 0.02
## hours         -0.10 0.01
## ln_wage       -0.12 0.00
# Correlation matrix of the four analysis variables, computed on rows that
# are complete for all of them.
corr_vars <- c("education", "motivation", "hours", "ln_wage")
corr_matrix <- cor(df1[, corr_vars], use = "complete.obs")
cat("\n### Correlations\n")
## 
## ### Correlations
print(corr_matrix)
##            education motivation     hours   ln_wage
## education  1.0000000  0.1416336 0.6463031 0.5134663
## motivation 0.1416336  1.0000000 0.8154698 0.3538311
## hours      0.6463031  0.8154698 1.0000000 0.6042621
## ln_wage    0.5134663  0.3538311 0.6042621 1.0000000
# Density plot of log wages
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# ggplot objects are only drawn when printed. Relying on top-level autoprint
# means no figure appears when the script is run through source(), so each
# plot is printed explicitly.
density_plot <- ggplot(df1, aes(x = ln_wage)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  labs(title = "Density of Log(Wage)", x = "Log(Wage)", y = "Density")
print(density_plot)

# Histogram of education
education_hist <- ggplot(df1, aes(x = education)) +
  geom_histogram(fill = "lightgreen", color = "black", bins = 20) +
  labs(
    title = "Histogram of Years of Education",
    x = "Years of Education",
    y = "Count"
  )
print(education_hist)

# Interpretation:

#On average, individuals have about 12.8 years of education — roughly a high school level.

#Motivation is centered around zero (likely standardized).

#Hours worked per day average about 8 hours, suggesting full-time work.

#Wages (in log form) have a roughly symmetric distribution, as the density plot shows a smooth bell-shaped curve.

#Correlations:

#Education, motivation, and hours are all positively correlated with ln(wage).

#The strongest relationship is between hours and motivation (0.82) and education and hours (0.65), implying that more motivated and educated individuals tend to work longer hours.

#No variable shows problematic (perfect) multicollinearity at this stage.
# --- (b) Compare descriptive statistics by motivation and education groups ---

# Motivation groups: split at zero.
high_motivation <- df1$motivation >= 0
df1[["motivation_group"]] <- ifelse(high_motivation, "High (≥0)", "Low (<0)")
cat("\n### Means by Motivation Group\n")
## 
## ### Means by Motivation Group
means_by_motivation <- aggregate(
  df1[, c("hours", "education", "ln_wage")],
  by = list(Motivation = df1$motivation_group),
  FUN = mean,
  na.rm = TRUE
)
print(means_by_motivation)
##   Motivation    hours education  ln_wage
## 1  High (≥0) 8.275187  13.05602 3.425180
## 2   Low (<0) 7.630419  12.47701 3.294857
# Education groups: 14 or more years of education counts as college.
college_or_more <- df1$education >= 14
df1[["edu_group"]] <- ifelse(college_or_more, "College or more", "Below college")
cat("\n### Means by Education Group\n")
## 
## ### Means by Education Group
means_by_education <- aggregate(
  df1[, c("hours", "motivation", "ln_wage")],
  by = list(Education = df1$edu_group),
  FUN = mean,
  na.rm = TRUE
)
print(means_by_education)
##         Education    hours  motivation ln_wage
## 1   Below college 7.755565 -0.09976975 3.28440
## 2 College or more 8.303090  0.21561040 3.49438
# Interpretation:

#Individuals with higher motivation work about 0.65 more hours and earn higher wages.

#Those with college education or more work about 0.55 more hours and earn significantly higher wages than those with less education.

#Both motivation and education are positively associated with hours and wages — consistent with standard labor economics theory.
# --- (c) Scatter plot: hours vs ln_wage ---
# Log wage on the x-axis, daily hours on the y-axis. The plot is printed
# explicitly so that it is also drawn when the script is run through source().
hours_wage_scatter <- ggplot(df1, aes(x = ln_wage, y = hours)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  labs(
    title = "Hours Worked vs Log(Wage)",
    x = "Log(Wage)",
    y = "Hours Worked per Day"
  )
print(hours_wage_scatter)

# Interpretation:

#The scatter plot shows a positive relationship between log(wage) and hours.

#Individuals with higher wages tend to work slightly longer hours.

#The relationship is not perfectly linear, but the upward trend is clear.
# --- (d) Simple regression: hours on ln_wage ---
# OLS of daily hours on log wage (level-log specification).
model1 <- lm(hours ~ ln_wage, data = df1)
cat("\n### Model 1: Simple Regression (hours ~ ln_wage)\n")
## 
## ### Model 1: Simple Regression (hours ~ ln_wage)
# Printed explicitly (as done for the tables above) so the summary is also
# shown when the script is run through source().
print(summary(model1))
## 
## Call:
## lm(formula = hours ~ ln_wage, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.33182 -0.27013  0.00022  0.26766  1.47676 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.63582    0.08071   45.05   <2e-16 ***
## ln_wage      1.28478    0.02396   53.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.39 on 4998 degrees of freedom
## Multiple R-squared:  0.3651, Adjusted R-squared:  0.365 
## F-statistic:  2875 on 1 and 4998 DF,  p-value: < 2.2e-16
# Scatter with regression line and 95% CI
# The smoothing formula is given explicitly; this is the same linear fit as
# before but without the "`geom_smooth()` using formula = 'y ~ x'" message.
# The plot is printed explicitly so it is also drawn under source().
regression_plot <- ggplot(df1, aes(x = ln_wage, y = hours)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ x, se = TRUE, color = "red") +
  labs(
    title = "Hours vs Log(Wage) with Regression Line",
    x = "Log(Wage)",
    y = "Hours Worked per Day"
  )
print(regression_plot)

# Interpretation:

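#The coefficient (1.285) means that a 1-unit increase in ln(wage) (i.e. wages multiplied by e ≈ 2.72, a rise of about 172%, not 100%) is associated with 1.29 more hours worked per day. Equivalently, a 1% higher wage is associated with about 0.013 more hours (under one minute) per day.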

#The relationship is positive and highly significant.

#The model explains about 36.5% of the variation in hours worked — moderately strong for a single-variable regression.
# --- (e) Regression with education added ---
# Same specification as Model 1 with years of education as a control.
model2 <- lm(hours ~ ln_wage + education, data = df1)
cat("\n### Model 2: hours ~ ln_wage + education\n")
## 
## ### Model 2: hours ~ ln_wage + education
# Printed explicitly so the summary is also shown under source().
print(summary(model2))
## 
## Call:
## lm(formula = hours ~ ln_wage + education, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24877 -0.23175  0.00178  0.23356  1.29310 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.133660   0.071392   57.90   <2e-16 ***
## ln_wage     0.786568   0.024323   32.34   <2e-16 ***
## education   0.092129   0.002309   39.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3397 on 4997 degrees of freedom
## Multiple R-squared:  0.5185, Adjusted R-squared:  0.5183 
## F-statistic:  2690 on 2 and 4997 DF,  p-value: < 2.2e-16
# Interpretation:

#Once we control for education, the coefficient on ln(wage) drops from 1.285 → 0.787.

#This means that part of the positive effect of wages on hours was actually due to education, since more educated people both earn more and work more.

#Education has an independent, positive effect: each extra year of schooling increases hours by about 0.09 hours (≈5.5 minutes) per day.
# --- (f) Full regression: add motivation ---
# Model 2 plus the motivation measure.
model3 <- lm(hours ~ ln_wage + education + motivation, data = df1)
cat("\n### Model 3: hours ~ ln_wage + education + motivation\n")
## 
## ### Model 3: hours ~ ln_wage + education + motivation
# Printed explicitly so the summary is also shown under source().
print(summary(model3))
## 
## Call:
## lm(formula = hours ~ ln_wage + education + motivation, data = df1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36675 -0.06728  0.00121  0.06598  0.33396 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.9681690  0.0223144  267.46   <2e-16 ***
## ln_wage     0.2099211  0.0075306   27.88   <2e-16 ***
## education   0.0999287  0.0006756  147.91   <2e-16 ***
## motivation  0.2499828  0.0010803  231.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09925 on 4996 degrees of freedom
## Multiple R-squared:  0.9589, Adjusted R-squared:  0.9589 
## F-statistic: 3.886e+04 on 3 and 4996 DF,  p-value: < 2.2e-16
# --- Optional: Regression comparison table ---
# stargazer is only needed for this side-by-side table, so it is treated as
# an optional dependency: when it is not installed the table is skipped with
# a message instead of aborting the script.
#install.packages("stargazer")
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
if (requireNamespace("stargazer", quietly = TRUE)) {
  stargazer::stargazer(
    model1, model2, model3,
    type = "text",
    title = "Regression Results Comparison",
    dep.var.labels = "Hours Worked",
    covariate.labels = c("Log(Wage)", "Education (Years)", "Motivation")
  )
} else {
  message(
    "Package 'stargazer' is not installed; ",
    "skipping the regression comparison table."
  )
}
## 
## Regression Results Comparison
## ========================================================================================================
##                                                     Dependent variable:                                 
##                     ------------------------------------------------------------------------------------
##                                                         Hours Worked                                    
##                                 (1)                         (2)                         (3)             
## --------------------------------------------------------------------------------------------------------
## Log(Wage)                    1.285***                    0.787***                     0.210***          
##                               (0.024)                     (0.024)                     (0.008)           
##                                                                                                         
## Education (Years)                                        0.092***                     0.100***          
##                                                           (0.002)                     (0.001)           
##                                                                                                         
## Motivation                                                                            0.250***          
##                                                                                       (0.001)           
##                                                                                                         
## Constant                     3.636***                    4.134***                     5.968***          
##                               (0.081)                     (0.071)                     (0.022)           
##                                                                                                         
## --------------------------------------------------------------------------------------------------------
## Observations                   5,000                       5,000                       5,000            
## R2                             0.365                       0.518                       0.959            
## Adjusted R2                    0.365                       0.518                       0.959            
## Residual Std. Error      0.390 (df = 4998)           0.340 (df = 4997)           0.099 (df = 4996)      
## F Statistic         2,874.511*** (df = 1; 4998) 2,690.303*** (df = 2; 4997) 38,858.290*** (df = 3; 4996)
## ========================================================================================================
## Note:                                                                        *p<0.1; **p<0.05; ***p<0.01
# Interpretation:

# The fit improves dramatically (R² ≈ 0.96), meaning the model now explains almost all variation in hours worked.

# The effect of ln(wage) further decreases (0.21), showing that once we control for motivation and education, the direct wage–hours link is weaker.

# Motivation strongly affects hours: a one-unit increase in motivation increases hours by 0.25 hours (~15 minutes).


#Preferred Model:

#Model 3 is the most comprehensive and theoretically sound because:

#It includes key determinants of labor supply (education, motivation, wage).

#It explains most of the variation in hours.

#Coefficients are significant and directionally consistent with economic intuition.

# --- End of script ---