###############################################################
#Labour Economics PROBLEM SET 1 – DATA ANALYSIS IN R
#Tasks (a)–(f): Using ps1_clean_data.Rda
# Author: Bademba Drammeh
#Date:
###############################################################
# --- SETUP ---
# The workspace is intentionally not cleared with rm(list = ls()) and the
# working directory is not changed with setwd(): both alter the user's session
# and make the script fail on any other machine. Start from a fresh R session
# if a clean workspace is required.
# Optional libraries (the ones actually used are attached where first needed)
#library(psych) # For descriptive stats
#library(ggplot2) # For plotting
#library(stargazer) # For regression tables (optional)
#library(tinytex)
# --- LOAD DATA ---
# 1. Folder that holds the data file (the folder, not the file itself)
# Previously used location, kept for reference:
# "C:/Users/bdrammeh/3D Objects/Desktop/winter25-26/Labour Economics/Assginments"
data_dir <- "C:/Users/bdrammeh/Desktop/Labour Economics/assignmnt"
data_file <- "ps1_clean_data.Rda"
# Look in the data folder first, then fall back to the current working
# directory, so the script also runs when the file sits next to it.
data_candidates <- c(file.path(data_dir, data_file), data_file)
data_path <- data_candidates[file.exists(data_candidates)][1]
if (is.na(data_path)) {
  stop(
    "Cannot find '", data_file, "' in '", data_dir,
    "' or in the current working directory '", getwd(), "'.",
    call. = FALSE
  )
}
# load() restores the saved objects and (invisibly) returns their names
loaded_objects <- load(data_path)
# Check which object was loaded
print(loaded_objects)
## [1] "df1"
# If the dataset isn't called df1, rename it accordingly
# Example: if you see "ps1_clean_data" in the printed names
#df1 <- ps1_clean_data
# Everything below works on df1, so stop early with a clear message if it is
# missing or is not a data frame.
if (!exists("df1") || !is.data.frame(df1)) {
  stop(
    "Expected a data frame named 'df1'; the file provided: ",
    paste(loaded_objects, collapse = ", "),
    call. = FALSE
  )
}
# --- (a) Generate log of wage and descriptive statistics ---
# Natural log of the wage column, stored alongside the raw variable
df1[["ln_wage"]] <- log(df1[["wage"]])
# Keep only rows with a finite log wage and a strictly positive wage; rows
# where this condition evaluates to NA are dropped as well
has_valid_wage <- is.finite(df1[["ln_wage"]]) & df1[["wage"]] > 0
df1 <- df1[!is.na(has_valid_wage) & has_valid_wage, , drop = FALSE]
# Descriptive statistics (psych::describe)
#install.packages("psych")
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
cat("### Descriptive Statistics\n")
## ### Descriptive Statistics
summary_columns <- c("education", "motivation", "hours", "ln_wage")
describe(df1[, summary_columns])
## vars n mean sd median trimmed mad min max range skew
## education 1 5000 12.77 2.42 12.00 12.52 2.97 10.00 24.00 14.00 0.67
## motivation 2 5000 0.01 1.39 0.00 0.01 1.39 -4.56 5.14 9.71 0.00
## hours 3 5000 7.95 0.49 7.95 7.95 0.49 6.35 9.70 3.35 0.13
## ln_wage 4 5000 3.36 0.23 3.35 3.36 0.23 2.37 4.12 1.75 0.00
## kurtosis se
## education -0.15 0.03
## motivation -0.10 0.02
## hours -0.10 0.01
## ln_wage -0.12 0.00
# Pairwise correlations between the four analysis variables, computed on
# rows that are complete for all of them
cat("\n### Correlations\n")
##
## ### Correlations
corr_columns <- c("education", "motivation", "hours", "ln_wage")
corr_matrix <- cor(df1[, corr_columns], use = "complete.obs")
print(corr_matrix)
## education motivation hours ln_wage
## education 1.0000000 0.1416336 0.6463031 0.5134663
## motivation 0.1416336 1.0000000 0.8154698 0.3538311
## hours 0.6463031 0.8154698 1.0000000 0.6042621
## ln_wage 0.5134663 0.3538311 0.6042621 1.0000000
# Kernel density of log wages
#install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
ggplot(data = df1, mapping = aes(x = ln_wage)) +
  geom_density(alpha = 0.6, fill = "skyblue") +
  labs(
    x = "Log(Wage)",
    y = "Density",
    title = "Density of Log(Wage)"
  )
# Distribution of years of education, drawn with 20 bins
ggplot(data = df1, mapping = aes(x = education)) +
  geom_histogram(bins = 20, color = "black", fill = "lightgreen") +
  labs(
    x = "Years of Education",
    y = "Count",
    title = "Histogram of Years of Education"
  )
# Interpretation:
#On average, individuals have about 12.8 years of education — roughly a high school level.
#Motivation is centered around zero (mean 0.01) with a standard deviation of about 1.39, so it appears mean-centered but not scaled to unit variance.
#Hours worked per day average about 8 hours, suggesting full-time work.
#Wages (in log form) have a roughly symmetric distribution, as the density plot shows a smooth bell-shaped curve.
#Correlations:
#Education, motivation, and hours are all positively correlated with ln(wage).
#The strongest correlations are between hours and motivation (0.82) and between hours and education (0.65), suggesting that more motivated and more educated individuals tend to work longer hours.
#No variable shows problematic (perfect) multicollinearity at this stage.
# --- (b) Compare descriptive statistics by motivation and education groups ---
# Split the sample at motivation = 0: non-negative values are labelled high
is_high_motivation <- df1$motivation >= 0
df1$motivation_group <- ifelse(is_high_motivation, "High (≥0)", "Low (<0)")
cat("\n### Means by Motivation Group\n")
##
## ### Means by Motivation Group
# Group means of hours, education and log wage, ignoring missing values
motivation_means <- aggregate(
  x = df1[, c("hours", "education", "ln_wage")],
  by = list(Motivation = df1$motivation_group),
  FUN = mean,
  na.rm = TRUE
)
print(motivation_means)
## Motivation hours education ln_wage
## 1 High (≥0) 8.275187 13.05602 3.425180
## 2 Low (<0) 7.630419 12.47701 3.294857
# Split the sample at 14 years of education: 14 or more counts as college
has_college <- df1$education >= 14
df1$edu_group <- ifelse(has_college, "College or more", "Below college")
cat("\n### Means by Education Group\n")
##
## ### Means by Education Group
# Group means of hours, motivation and log wage, ignoring missing values
education_means <- aggregate(
  x = df1[, c("hours", "motivation", "ln_wage")],
  by = list(Education = df1$edu_group),
  FUN = mean,
  na.rm = TRUE
)
print(education_means)
## Education hours motivation ln_wage
## 1 Below college 7.755565 -0.09976975 3.28440
## 2 College or more 8.303090 0.21561040 3.49438
# Interpretation:
#Individuals with higher motivation work about 0.65 more hours and earn higher wages.
#Those with college education or more work about 0.55 more hours per day and have a mean log wage about 0.21 higher (3.49 vs 3.28) than those with less education.
#Both motivation and education are positively associated with hours and wages — consistent with standard labor economics theory.
# --- (c) Scatter plot: hours vs ln_wage ---
# Log wage on the horizontal axis, daily hours on the vertical axis;
# semi-transparent points reduce overplotting
ggplot(data = df1, mapping = aes(x = ln_wage, y = hours)) +
  geom_point(color = "steelblue", alpha = 0.5) +
  labs(
    x = "Log(Wage)",
    y = "Hours Worked per Day",
    title = "Hours Worked vs Log(Wage)"
  )
# Interpretation:
#The scatter plot shows a positive relationship between log(wage) and hours.
#Individuals with higher wages tend to work slightly longer hours.
#The relationship is not perfectly linear, but the upward trend is clear.
# --- (d) Simple regression: hours on ln_wage ---
cat("\n### Model 1: Simple Regression (hours ~ ln_wage)\n")
##
## ### Model 1: Simple Regression (hours ~ ln_wage)
# OLS of daily hours on log wage only; the fitted object is reused later
model1 <- lm(formula = hours ~ ln_wage, data = df1)
summary(model1)
##
## Call:
## lm(formula = hours ~ ln_wage, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33182 -0.27013 0.00022 0.26766 1.47676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.63582 0.08071 45.05 <2e-16 ***
## ln_wage 1.28478 0.02396 53.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.39 on 4998 degrees of freedom
## Multiple R-squared: 0.3651, Adjusted R-squared: 0.365
## F-statistic: 2875 on 1 and 4998 DF, p-value: < 2.2e-16
# Same scatter as before, overlaid with the fitted linear trend and its
# confidence band (se = TRUE)
ggplot(data = df1, mapping = aes(x = ln_wage, y = hours)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    x = "Log(Wage)",
    y = "Hours Worked per Day",
    title = "Hours vs Log(Wage) with Regression Line"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Interpretation:
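#The coefficient (1.285) means that a 1-unit increase in ln(wage), i.e. wages multiplied by e ≈ 2.72 (about a 172% increase), is associated with 1.29 more hours worked per day; equivalently, a 1% higher wage is associated with roughly 0.013 more hours per day.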
#The relationship is positive and highly significant.
#The model explains about 36.5% of the variation in hours worked — moderately strong for a single-variable regression.
# --- (e) Regression with education added ---
cat("\n### Model 2: hours ~ ln_wage + education\n")
##
## ### Model 2: hours ~ ln_wage + education
# OLS of daily hours on log wage, now controlling for years of education
model2 <- lm(formula = hours ~ ln_wage + education, data = df1)
summary(model2)
##
## Call:
## lm(formula = hours ~ ln_wage + education, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24877 -0.23175 0.00178 0.23356 1.29310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.133660 0.071392 57.90 <2e-16 ***
## ln_wage 0.786568 0.024323 32.34 <2e-16 ***
## education 0.092129 0.002309 39.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3397 on 4997 degrees of freedom
## Multiple R-squared: 0.5185, Adjusted R-squared: 0.5183
## F-statistic: 2690 on 2 and 4997 DF, p-value: < 2.2e-16
# Interpretation:
#Once we control for education, the coefficient on ln(wage) drops from 1.285 → 0.787.
#This means that part of the wage–hours association in Model 1 was picking up education (omitted-variable bias), since more educated people both earn more and work more.
#Holding ln(wage) constant, each extra year of schooling is associated with about 0.09 more hours (≈5.5 minutes) per day.
# --- (f) Full regression: add motivation ---
cat("\n### Model 3: hours ~ ln_wage + education + motivation\n")
##
## ### Model 3: hours ~ ln_wage + education + motivation
# OLS of daily hours on log wage, years of education and motivation
model3 <- lm(formula = hours ~ ln_wage + education + motivation, data = df1)
summary(model3)
##
## Call:
## lm(formula = hours ~ ln_wage + education + motivation, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.36675 -0.06728 0.00121 0.06598 0.33396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9681690 0.0223144 267.46 <2e-16 ***
## ln_wage 0.2099211 0.0075306 27.88 <2e-16 ***
## education 0.0999287 0.0006756 147.91 <2e-16 ***
## motivation 0.2499828 0.0010803 231.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09925 on 4996 degrees of freedom
## Multiple R-squared: 0.9589, Adjusted R-squared: 0.9589
## F-statistic: 3.886e+04 on 3 and 4996 DF, p-value: < 2.2e-16
# --- Optional: Regression comparison table ---
# Side-by-side text table of the three fitted models
#install.packages("stargazer")
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
stargazer(
  model1, model2, model3,
  type = "text",
  title = "Regression Results Comparison",
  dep.var.labels = "Hours Worked",
  covariate.labels = c("Log(Wage)", "Education (Years)", "Motivation")
)
##
## Regression Results Comparison
## ========================================================================================================
## Dependent variable:
## ------------------------------------------------------------------------------------
## Hours Worked
## (1) (2) (3)
## --------------------------------------------------------------------------------------------------------
## Log(Wage) 1.285*** 0.787*** 0.210***
## (0.024) (0.024) (0.008)
##
## Education (Years) 0.092*** 0.100***
## (0.002) (0.001)
##
## Motivation 0.250***
## (0.001)
##
## Constant 3.636*** 4.134*** 5.968***
## (0.081) (0.071) (0.022)
##
## --------------------------------------------------------------------------------------------------------
## Observations 5,000 5,000 5,000
## R2 0.365 0.518 0.959
## Adjusted R2 0.365 0.518 0.959
## Residual Std. Error 0.390 (df = 4998) 0.340 (df = 4997) 0.099 (df = 4996)
## F Statistic 2,874.511*** (df = 1; 4998) 2,690.303*** (df = 2; 4997) 38,858.290*** (df = 3; 4996)
## ========================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Interpretation:
# The fit improves dramatically (R² ≈ 0.96), meaning the model now explains almost all variation in hours worked.
# The effect of ln(wage) further decreases (0.21), showing that once we control for motivation and education, the direct wage–hours link is weaker.
# Motivation strongly affects hours: a one-unit increase in motivation increases hours by 0.25 hours (~15 minutes).
#Preferred Model:
#Model 3 is the most comprehensive and theoretically sound because:
#It includes key determinants of labor supply (education, motivation, wage).
#It explains most of the variation in hours.
#Coefficients are significant and directionally consistent with economic intuition.
# --- END OF SCRIPT ---