# Install and activate required packages
#####
# Install any required package that is not yet available in the library.
# AER is included because the Fatalities dataset used by this script is loaded
# from it via data(..., package = "AER").
required_pkgs <- c(
  "Hmisc", "dplyr", "tidyr", "sandwich", "lmtest", "plm", "data.table", "AER"
)
# Compare against package names only (the row names of the matrix returned by
# installed.packages()), not against every cell of that matrix.
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

library(Hmisc) # for describe
library(dplyr) # for %>%
library(tidyr) # drop_na
library(sandwich) # for vcovCL
library(lmtest) # for coeftest
library(plm) # for pdata.frame
library(data.table)
#####

# Read data and generate dataframe: data
#####
# https://search.r-project.org/CRAN/refmans/AER/html/StockWatson2007.html # for downloading dataset;
# dataset has been processed in R to generate Fatalities.csv
# To access data labels: http://fmwww.bc.edu/ec-p/data/stockwatson/fatality.des (variable names may differ from R's AER dataset)
# Load the Fatalities dataset shipped with the AER package into the workspace
data("Fatalities", package = "AER")
#####

# Export dataframe Fatalities as "Fatalities.csv"
#####
# The file is written relative to the current working directory; the working
# directory is deliberately left unchanged (no setwd() call).
write.csv(Fatalities, file = "Fatalities.csv", row.names = FALSE, fileEncoding = "UTF-8")
#####

# Generate working dataframe df as a copy of dataframe Fatalities
#####
# All later transformations are applied to df, leaving Fatalities untouched
df <- Fatalities
dim(df)
head(df)
#####

# Generate variable number of traffic deaths per 10,000 people living in the state in the year: frate (the dependent variable)
#####
# Inspect the two ingredients of the fatality rate before combining them
describe(df$fatal)
describe(df$pop)
# frate = fatal / pop, rescaled to deaths per 10,000 people
df$frate <- df$fatal / df$pop * 10000
describe(df$frate)
# Show the first 40 rows to eyeball the new variable by state and year
print(df[seq_len(40), c("state", "year", "frate")])
#####

# Describe beertax, the independent variable of interest
#####
# The tax on a case of beer, which is an available measure of state alcohol taxes more generally
describe(df$beertax)
print(df[seq_len(40), c("state", "year", "beertax")])
#####

# Generate control variable indicating state punishment for drunk driving (mandatory jail sentence or mandatory community service): punish
#####
# punish is "yes" when the state has a mandatory jail sentence OR mandatory
# community service, "no" when it has neither. Rows where the logical
# condition evaluates to NA stay NA in the factor.
# The levels are given explicitly so that the "no"/"yes" labels are always
# attached to FALSE/TRUE respectively, even if only one of the two values
# happens to be present in the data.
df$punish <- factor(
  df$jail == "yes" | df$service == "yes",
  levels = c(FALSE, TRUE),
  labels = c("no", "yes")
)
print(df[seq_len(40), c("jail", "service", "punish")])
describe(df$punish)
table(df$punish)
# Cross-tabulate the two source variables to check how punish was built
table(df$jail, df$service)
#####

# Check within-state variation of dummy punish
#####
# To make this control viable in a FE model, there must be some intra-state variation 
# Count the rows in every (state, punish) combination; a state that shows up
# with more than one punish value has within-state variation.
var_punish <- summarise(
  group_by(df, state, punish),
  count = n(),
  .groups = "drop"
)
print(as.data.frame(var_punish))
#####

# Drop row if relevant variables are missing
#####
# Dimensions before dropping incomplete rows
dim(df)
# Keep only rows where none of the listed variables is missing
df <- drop_na(df, frate, beertax, jail, service, state, year)
# Dimensions after dropping incomplete rows
dim(df)
#####

# Drop row if year is different from 1982 or 1988
#####
# Doing this just to remind us that fixed effects are viable with as few as two time periods
# Keep only the 1982 and 1988 observations.
# %in% never returns NA, so a missing year can never produce an all-NA row
# in the subset (which a logical index built with == would do).
df <- df[df$year %in% c(1982, 1988), ]
dim(df)
#####

# Check once again within-state variation of dummy punish
#####
# To make this control viable in a FE model, there must be some intra-state variation 
# Recount the rows per (state, punish) combination on the current df
var_punish <- summarise(
  group_by(df, state, punish),
  count = n(),
  .groups = "drop"
)
print(as.data.frame(var_punish))
# Eyeball beertax by state and year for the first 40 rows
print(df[seq_len(40), c("state", "year", "beertax")])
#####

# Generate dummy out of character variable punish: punish_d
#####
describe(df$punish)
# Numeric 0/1 version of punish: 1 when punish is "yes", 0 otherwise
# (a missing punish stays missing)
df$punish_d <- as.numeric(df$punish == "yes")
describe(df$punish_d)
# Frequency table of the dummy, shown as a data.table
as.data.table(table(df$punish_d))
#####

# Estimate Pooled OLS model
#####
# Pooled OLS of frate on beertax and the punish dummy
# (as.factor(punish) could be used in place of punish_d)
pooled <- lm(frate ~ beertax + punish_d, data = df)
# Standard errors clustered by state
cluster_se <- vcovCL(pooled, cluster = ~state)
coeftest(pooled, vcov = cluster_se)
#####

# Estimate entity FE model using dummies: Least Squares Dummy Variables (LSDV) estimator
#####
# State fixed effects enter as one dummy per state via as.factor(state)
lsdv1 <- lm(frate ~ beertax + as.factor(punish) + as.factor(state), data = df)
cluster_se <- vcovCL(lsdv1, cluster = ~state)
coeftest(lsdv1, vcov = cluster_se)
# Interpretation (figures as reported by the original author): a USD 1 increase in beertax is associated with a reduction of 0.8681 traffic deaths per 10,000 inhabitants, holding constant the strictness of the punishment applied by the states and factors affecting traffic deaths that vary across states but are constant over the period analysed (1982-1988; only the 1982 and 1988 observations are in the sample). The effect of beertax is different from zero at the 5% significance level (p-value = 0.016).
#####

# Estimate entity FE model: demeaning
#####
# Generate panel-like dataframe: df_panel
# Declare the panel structure: state is the entity index, year the time index
df_panel <- pdata.frame(df, index = c("state", "year"))
class(df_panel)

# Within (entity fixed effects) estimator
demean1 <- plm(
  frate ~ 1 + beertax + as.factor(punish),
  data = df_panel,
  model = "within",
  effect = "individual"
)
# Recovering an intercept for a within model: https://www.rdocumentation.org/packages/plm/versions/2.6-4/topics/within_intercept
# NB: the model returned by within_intercept(..., return.model = TRUE) identifies itself as a pooling model, e.g., in summary().
# From plm's documentation (https://cran.r-project.org/web/packages/plm/plm.pdf): within_intercept gives an overall intercept for within models and its accompanying standard error, or a within model with the overall intercept. "It can be considered an overall intercept in the within model framework and is the weighted mean of fixed effects. [...] with within_intercept one also gets the associated standard error and it is possible to get an overall intercept for two-way fixed effect models."
overallint <- within_intercept(demean1, return.model = TRUE)
# Summary with HC1 standard errors clustered by group
summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
#####

# Estimate entity and time FE model using dummies: Least Squares Dummy Variables (LSDV) estimator
#####
# State and year fixed effects both enter as dummies
lsdv2 <- lm(
  frate ~ beertax + as.factor(punish) + as.factor(state) + as.factor(year),
  data = df
)
cluster_se <- vcovCL(lsdv2, cluster = ~state)
coeftest(lsdv2, vcov = cluster_se)
# Interpretation (figures as reported by the original author): a USD 1 increase in beertax is associated with a reduction of 1.0972 traffic deaths per 10,000 inhabitants, holding constant the strictness of the punishment applied by the states, factors affecting traffic deaths that vary across states but are constant over the period analysed (1982-1988; only the 1982 and 1988 observations are in the sample), and factors that vary over the period analysed but affect all states. The effect of beertax is different from zero at the 5% significance level (p-value = 0.016).
# NOTE(review): the p-value quoted here is identical to the one quoted for the state-only LSDV model; confirm against the coeftest output.
#####

# Estimate entity and time FE model: demeaning
#####
# Two-way within estimator (entity and time effects)
demean2 <- plm(
  frate ~ 1 + beertax + as.factor(punish),
  data = df_panel,
  model = "within",
  effect = "twoways"
)
# Re-express the within model with an overall intercept
overallint <- within_intercept(demean2, return.model = TRUE)
summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
#####

# Estimate random effects model
#####
# Random effects estimator with individual (state) effects
random <- plm(
  frate ~ 1 + beertax + as.factor(punish),
  data = df_panel,
  model = "random",
  effect = "individual"
)
# HC1 covariance matrix clustered by group
vcov_clustered <- vcovHC(random, type = "HC1", cluster = "group")
coeftest(random, vcov = vcov_clustered)
#####

# Estimate mixed effects model
#####
# Random individual effects combined with year dummies as regressors
mixed <- plm(
  frate ~ 1 + beertax + as.factor(punish) + as.factor(year),
  data = df_panel,
  model = "random",
  effect = "individual"
)
vcov_clustered <- vcovHC(mixed, type = "HC1", cluster = "group")
coeftest(mixed, vcov = vcov_clustered)
#####

# Run Hausman test
#####
# Compares the two-way within model with the mixed model; the original author
# reads the result as favoring the fixed effects model over the mixed effects model
phtest(demean2, mixed)
#####

# FURTHER STUDYING FE: MANUALLY ESTIMATE DEMEANING MODELS #########################################################
# Generate demeaned variables
#####
# Generate state demeaned variables
# Within each state: compute the state means, then subtract them.
# Later expressions in the same mutate() can use the columns created earlier in it.
df <- ungroup(
  mutate(
    group_by(df, state),
    mean_s_frate = mean(frate, na.rm = TRUE),
    mean_s_beertax = mean(beertax, na.rm = TRUE),
    mean_s_punish = mean(punish_d, na.rm = TRUE),
    frate_sd = frate - mean_s_frate,
    beertax_sd = beertax - mean_s_beertax,
    punish_sd = punish_d - mean_s_punish
  )
)
describe(df$frate_sd)
describe(df$beertax_sd)
describe(df$punish_sd) # all three new variables with mean very close to zero

# Generate year demeaned variables
# Within each year: compute the year means, then subtract them
df <- ungroup(
  mutate(
    group_by(df, year),
    mean_y_frate = mean(frate, na.rm = TRUE),
    mean_y_beertax = mean(beertax, na.rm = TRUE),
    mean_y_punish = mean(punish_d, na.rm = TRUE),
    frate_yd = frate - mean_y_frate,
    beertax_yd = beertax - mean_y_beertax,
    punish_yd = punish_d - mean_y_punish
  )
)
describe(df$frate_yd)
describe(df$beertax_yd)
describe(df$punish_yd) # all three new variables with mean very close to zero

# Generate state and year demeaned variables
# Take the state-demeaned variables and demean them again within each year
df <- ungroup(
  mutate(
    group_by(df, year),
    mean_sy_frate = mean(frate_sd, na.rm = TRUE),
    mean_sy_beertax = mean(beertax_sd, na.rm = TRUE),
    mean_sy_punish = mean(punish_sd, na.rm = TRUE),
    frate_syd = frate_sd - mean_sy_frate,
    beertax_syd = beertax_sd - mean_sy_beertax,
    punish_syd = punish_sd - mean_sy_punish
  )
)
describe(df$frate_syd)
describe(df$beertax_syd)
describe(df$punish_syd) # all three new variables with mean very close to zero

# List the columns now present in df
colnames(df)

# Generate year and state demeaned variables, to check whether order makes a difference
# Take the year-demeaned variables and demean them again within each state
df <- df %>%
  group_by(state) %>%
  mutate(mean_ys_frate = mean(frate_yd, na.rm = TRUE),
         mean_ys_beertax = mean(beertax_yd, na.rm = TRUE),
         mean_ys_punish = mean(punish_yd, na.rm = TRUE)) %>%
  mutate(frate_ysd = frate_yd - mean_ys_frate,
         beertax_ysd = beertax_yd - mean_ys_beertax,
         punish_ysd = punish_yd - mean_ys_punish) %>%
  ungroup()
describe(df$frate_ysd)
describe(df$beertax_ysd)
describe(df$punish_ysd) # all three new variables with mean very close to zero

# Compare the two demeaning orders variable by variable.
# near() compares with a numeric tolerance, so differences that are pure
# floating-point rounding noise are not reported as a mismatch (exact == would
# flag them). TRUE = both orders agree for that row; FALSE = they differ.
# NOTE(review): if dropping incomplete rows left some state with fewer years
# than the others, the two orders need not coincide -- check the output.
describe(near(df$frate_syd, df$frate_ysd))
describe(near(df$beertax_syd, df$beertax_ysd))
describe(near(df$punish_syd, df$punish_ysd))

colnames(df)
#####

# Estimate state fixed effects models
#####
# OLS on the manually state-demeaned variables, with an intercept
pooled_sd <- lm(frate_sd ~ beertax_sd + punish_sd, data = df)
cluster_se <- vcovCL(pooled_sd, cluster = ~state)
coeftest(pooled_sd, vcov = cluster_se)

# Same regression without an intercept
pooled_sd <- lm(frate_sd ~ 0 + beertax_sd + punish_sd, data = df)
cluster_se <- vcovCL(pooled_sd, cluster = ~state)
coeftest(pooled_sd, vcov = cluster_se) # omitting intercept does not make a difference, excepting for decreasing the se

# plm's within estimator with individual (state) effects, for comparison
demean_sd <- plm(
  frate ~ beertax + punish_d,
  data = df_panel,
  model = "within",
  effect = "individual"
)
summary(demean_sd, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
# overallint = within_intercept(demean_sd, return.model = TRUE)
# summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group")) 
#####

# Estimate time fixed effects models
#####
# OLS on the manually year-demeaned variables, with an intercept
pooled_yd <- lm(frate_yd ~ beertax_yd + punish_yd, data = df)
cluster_se <- vcovCL(pooled_yd, cluster = ~state)
coeftest(pooled_yd, vcov = cluster_se)

# Same regression without an intercept
pooled_yd <- lm(frate_yd ~ 0 + beertax_yd + punish_yd, data = df)
cluster_se <- vcovCL(pooled_yd, cluster = ~state)
coeftest(pooled_yd, vcov = cluster_se) # omitting intercept does not make a difference, excepting for decreasing the se

# plm's within estimator with time effects, for comparison
demean_yt <- plm(
  frate ~ beertax + punish_d,
  data = df_panel,
  model = "within",
  effect = "time"
)
summary(demean_yt, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
# overallint = within_intercept(demean_yt, return.model = TRUE)
# summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group")) 
#####

# Estimate state and time fixed effects models: syd
#####
# OLS on the variables demeaned by state and then by year, with an intercept
pooled_syd <- lm(frate_syd ~ beertax_syd + punish_syd, data = df)
cluster_se <- vcovCL(pooled_syd, cluster = ~state)
coeftest(pooled_syd, vcov = cluster_se)

# Same regression without an intercept
pooled_syd <- lm(frate_syd ~ 0 + beertax_syd + punish_syd, data = df)
cluster_se <- vcovCL(pooled_syd, cluster = ~state)
coeftest(pooled_syd, vcov = cluster_se) # omitting intercept does not make a difference, excepting for decreasing the se

# State-demeaned outcome and beertax, plus year dummies
# NOTE(review): punish enters as punish_syd (state-and-year demeaned) while the
# other variables are only state-demeaned -- confirm this mix is intended.
pooled_syd_1 <- lm(frate_sd ~ 0 + beertax_sd + punish_syd + as.factor(year), data = df)
cluster_se <- vcovCL(pooled_syd_1, cluster = ~state)
coeftest(pooled_syd_1, vcov = cluster_se)

# Year-demeaned outcome and beertax, plus state dummies
# NOTE(review): punish again enters as punish_syd rather than punish_yd -- confirm.
pooled_syd_2 <- lm(frate_yd ~ 0 + beertax_yd + punish_syd + as.factor(state), data = df)
cluster_se <- vcovCL(pooled_syd_2, cluster = ~state)
coeftest(pooled_syd_2, vcov = cluster_se)

# plm's two-way within estimator, for comparison
demean_syd <- plm(
  frate ~ beertax + punish_d,
  data = df_panel,
  model = "within",
  effect = "twoways"
)
summary(demean_syd, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
# overallint = within_intercept(demean_syd, return.model = TRUE)
# summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group")) 
##### 

# Estimate state and time fixed effects models: ysd (intercept changes, otherwise similar, but not identical, results as with syd)
#####
# OLS on the variables demeaned by year and then by state, with an intercept
pooled_ysd <- lm(frate_ysd ~ beertax_ysd + punish_ysd, data = df)
cluster_se <- vcovCL(pooled_ysd, cluster = ~state)
coeftest(pooled_ysd, vcov = cluster_se)

# Same regression without an intercept
pooled_ysd <- lm(frate_ysd ~ 0 + beertax_ysd + punish_ysd, data = df)
cluster_se <- vcovCL(pooled_ysd, cluster = ~state)
coeftest(pooled_ysd, vcov = cluster_se) # omitting intercept does not make a difference, excepting for decreasing the se

# State-demeaned outcome and beertax, plus year dummies
# NOTE(review): punish enters as punish_ysd (year-and-state demeaned) while the
# other variables are only state-demeaned -- confirm this mix is intended.
pooled_ysd_1 <- lm(frate_sd ~ 0 + beertax_sd + punish_ysd + as.factor(year), data = df)
cluster_se <- vcovCL(pooled_ysd_1, cluster = ~state)
coeftest(pooled_ysd_1, vcov = cluster_se)

# Year-demeaned outcome and beertax, plus state dummies
# NOTE(review): punish again enters as punish_ysd rather than punish_yd -- confirm.
pooled_ysd_2 <- lm(frate_yd ~ 0 + beertax_yd + punish_ysd + as.factor(state), data = df)
cluster_se <- vcovCL(pooled_ysd_2, cluster = ~state)
coeftest(pooled_ysd_2, vcov = cluster_se)

# plm's two-way within estimator, for comparison (the call has the same
# formula, data, model and effect as the one stored in demean_syd)
demean_ysd <- plm(
  frate ~ beertax + punish_d,
  data = df_panel,
  model = "within",
  effect = "twoways"
)
summary(demean_ysd, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
# overallint = within_intercept(demean_ysd, return.model = TRUE)
# summary(overallint, vcov = function(x) vcovHC(x, type = "HC1", cluster = "group"))
#####