# DCP134/184 - Avaliação de Políticas Públicas B

# install.packages("foreign", dep = T)
if (! "haven" %in% installed.packages()) install.packages("haven", dep = T) # for read_dta
library(haven)
if (! "Hmisc" %in% installed.packages()) install.packages("Hmisc", dep = T)  # for describe
library(Hmisc)
if (! "car" %in% installed.packages()) install.packages("car", dep = T) # for vif
library(car)
if (! "data.table" %in% installed.packages()) install.packages("data.table", dep = T)
library(data.table)
if (! "marginaleffects" %in% installed.packages()) install.packages("marginaleffects", dep = T) # for plotting
library(marginaleffects)
if (! "ggplot2" %in% installed.packages()) install.packages("ggplot2", dep = T) 
library(ggplot2)
if (! "stargazer" %in% installed.packages()) install.packages("stargazer", dep = T)
library(stargazer)
if (! "sandwich" %in% installed.packages()) install.packages("sandwich", dep = T) # for vcovHC
library(sandwich)
if (! "lmtest" %in% installed.packages()) install.packages("lmtest", dep = T) # for robust SE
library(lmtest)

# Set working directory.
# A bare `rm()` call used to sit here; with no arguments it removes nothing,
# so it was dropped. Restart the R session if a clean workspace is needed.
# `wd` is machine-specific: point it at the folder that holds
# "speeding_tickets_text.dta". The relative file names used below are
# resolved against this folder.
wd <- "D:/Dropbox/ana2/UFMG/DCP184/DCP184 karruz 2025a/20250312 - aula 02,04-08 - controle estatístico etc/exercicios_em_sala"
setwd(wd) # Note: setwd("./") only re-selects the current working directory; it does not locate the folder where this script is saved

# Bailey (2016, p. 251): chapter 5, question 4
# 4. What determines how much drivers are fined if they are stopped for speeding? Do demographics like age, gender, and race matter? To answer this question, we'll investigate traffic stops and citations in Massachusetts using data from Makowsky and Stratmann (2009). Even though state law sets a formula for tickets based on how fast the driver was driving, police officers in practice often deviate from the formula. The data in speeding_tickets_text.dta includes information on all traffic stops. It contains an amount for the fine for only those observations for which the police officer decided to assess a fine.

# Table 5.7: Variables for Speeding Ticket Data
# Variable name Description
# MPHover Miles per hour over the speed limit
# Amount Assessed fine for the ticket
# Age Age of driver

# a. Estimate a bivariate OLS model in which ticket amount is a function of age. Is age statistically significant? Is endogeneity possible?
# Read the Stata file from the working directory and convert the result of
# read_dta() to a plain data.frame.
dados <- read_dta("speeding_tickets_text.dta")
dados <- as.data.frame(dados)

# Quick look at the data: first rows and dimensions
head(dados)
dim(dados)

# Column-by-column summary of the whole data set
summary(dados)

# Detailed univariate descriptions of the variables used in the models
describe(dados$Amount)
describe(dados$Age)
describe(dados$MPHover)

# Bivariate OLS: fine amount as a function of driver age
mod4a <- lm(Amount ~ Age, data = dados)
summary(mod4a) # conventional standard errors
# Same coefficients, tested with HC1 heteroskedasticity-robust standard errors
coeftest(mod4a, vcov. = vcovHC(mod4a, type = "HC1"))

# b. Estimate the model from part (a) also controlling for miles per hour over the speed limit. Explain what happens to the coefficient on age and why.
# OLS of fine amount on age, now controlling for miles per hour over the limit
mod4b <- lm(Amount ~ Age + MPHover, data = dados)
summary(mod4b) # conventional standard errors
# Same coefficients, tested with HC1 heteroskedasticity-robust standard errors
coeftest(mod4b, vcov. = vcovHC(mod4b, type = "HC1"))

# c. Suppose we had only the first 1,000 observations in the data set. Estimate the model from part (b) and report on what happens to the standard errors and t statistics when we have fewer observations. (In Stata, use if _n <= 1000 at the end of the regression command to limit the sample to the first 1000 observations. Because the amount is missing for drivers who were not fined, the sample size will be much smaller than 1,000.)
# Same specification as mod4b, restricted to the first 1000 rows of `dados`.
# The subsetting expression is kept as written so the printed Call is unchanged.
mod4c <- lm(Amount ~ Age + MPHover, data = dados[1:1000,])
summary(mod4c) # conventional standard errors
# Same coefficients, tested with HC1 heteroskedasticity-robust standard errors
coeftest(mod4c, vcov. = vcovHC(mod4c, type = "HC1"))

# Additional analyses
# VIF
# Variance inflation factors for the regressors of mod4b
vif(mod4b)

# Add OutState
# Describe and tabulate OutState before using it as a regressor
describe(dados$OutState)
as.data.table(table(dados$OutState))

# OLS of fine amount on age, speed over the limit and OutState
mod4d <- lm(Amount ~ Age + MPHover + OutState, data = dados)
summary(mod4d) # conventional standard errors
# Same coefficients, tested with HC1 heteroskedasticity-robust standard errors
coeftest(mod4d, vcov. = vcovHC(mod4d, type = "HC1"))

# Define visual theme
# Use the minimal ggplot2 theme for every plot created from here on
theme_set(theme_minimal())

# Okabe-Ito palette, registered as the default for discrete fill and colour
# scales; the console width is widened in the same options() call.
okabeito <- c(
  "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
  "#D55E00", "#CC79A7", "#999999", "#000000"
)
options(
  ggplot2.discrete.fill = okabeito,
  ggplot2.discrete.colour = okabeito,
  width = 1000
)

# Plot mod4d
# https://marginaleffects.com/chapters/predictions.html#sec-predictions_visualization

# Predicted fine from mod4d along MPHover, one line per OutState value,
# with uncertainty based on the HC1 covariance. Add rug = TRUE for a rug plot.
plot_mod4d <- marginaleffects::plot_predictions(
  mod4d,
  condition = c("MPHover", "OutState"),
  vcov = "HC1",
  gray = FALSE
) +
  labs(
    title = "Estimated assessed fine for the ticket, by miles per hour over the speed limit and plate procedence",
    subtitle = "",
    x = "Miles per hour over the speed limit",
    y = "Estimated assessed fine for the ticket (USD)",
    caption = "Source of raw data: Makowsky and Stratmann (2009)."
  ) +
  # Title the colour legend and hide the duplicate legend for the fill
  guides(colour = guide_legend(title = "Out of state")) +
  scale_fill_discrete(guide = "none")

# Export the figure to the working directory as PDF and PNG (13 x 7.5 in)
ggsave(
  filename = "plot_mod4d.pdf",
  plot = plot_mod4d,
  device = "pdf",
  width = 13, height = 7.5, units = "in"
)
ggsave(
  filename = "plot_mod4d.png",
  plot = plot_mod4d,
  device = "png",
  width = 13, height = 7.5, units = "in"
)

# Add interaction: MPHover*OutState
# mod4d plus the MPHover x OutState interaction. The formula is kept as
# written so that the printed Call is unchanged.
mod4e <- lm(Amount ~ Age + MPHover + OutState + MPHover*OutState, data = dados)
summary(mod4e) # conventional standard errors
# Same coefficients, tested with HC1 heteroskedasticity-robust standard errors
coeftest(mod4e, vcov. = vcovHC(mod4e, type = "HC1"))

# Plot mod4e
# Predicted fine from mod4e (with the MPHover x OutState interaction) along
# MPHover, one line per OutState value, with uncertainty based on the HC1
# covariance and a rug showing where the observations lie.
plot_mod4e <- marginaleffects::plot_predictions(
  mod4e,
  condition = c("MPHover", "OutState"),
  vcov = "HC1",
  gray = FALSE,
  rug = TRUE
) +
  labs(
    title = "Estimated assessed fine for the ticket, by miles per hour over the speed limit and plate procedence",
    subtitle = "",
    x = "Miles per hour over the speed limit",
    y = "Estimated assessed fine for the ticket (USD)",
    caption = "Source of raw data: Makowsky and Stratmann (2009)."
  ) +
  # Title the colour legend and hide the duplicate legend for the fill
  guides(colour = guide_legend(title = "Driver's residence")) +
  scale_fill_discrete(guide = "none") +
  # Relabel the legend keys through the default discrete colour scale, which
  # honours options(ggplot2.discrete.colour). scale_color_hue() would swap in
  # the hue palette for the lines while the ribbons (fill) keep the palette
  # from options(), so lines and ribbons of one group would not match.
  # NOTE(review): labels assume the levels are ordered 0 (in state), then 1
  # (out of state) - confirm against the coding of OutState.
  scale_colour_discrete(labels = c("In state", "Out of state"))

# Export the figure to the working directory as PDF and PNG (13 x 7.5 in)
ggsave(
  filename = "plot_mod4e.pdf",
  plot = plot_mod4e,
  device = "pdf",
  width = 13, height = 7.5, units = "in"
)
ggsave(
  filename = "plot_mod4e.png",
  plot = plot_mod4e,
  device = "png",
  width = 13, height = 7.5, units = "in"
)

# Produce table of estimates, with robust standard errors
# For each model: compute the HC1 covariance matrix once, print the
# coefficient tests based on it, and keep the robust standard errors (square
# roots of the diagonal) for the table of estimates.
# The covariance matrix is reused rather than recomputed inside coeftest().
cov_mod4a <- vcovHC(mod4a, type = "HC1")
coeftest(mod4a, vcov. = cov_mod4a)
robust_se_mod4a <- sqrt(diag(cov_mod4a))

cov_mod4b <- vcovHC(mod4b, type = "HC1")
coeftest(mod4b, vcov. = cov_mod4b)
robust_se_mod4b <- sqrt(diag(cov_mod4b))

cov_mod4c <- vcovHC(mod4c, type = "HC1")
coeftest(mod4c, vcov. = cov_mod4c)
robust_se_mod4c <- sqrt(diag(cov_mod4c))

cov_mod4d <- vcovHC(mod4d, type = "HC1")
coeftest(mod4d, vcov. = cov_mod4d)
robust_se_mod4d <- sqrt(diag(cov_mod4d))

cov_mod4e <- vcovHC(mod4e, type = "HC1")
coeftest(mod4e, vcov. = cov_mod4e)
robust_se_mod4e <- sqrt(diag(cov_mod4e))

# Path of the HTML table, built from the `wd` folder defined earlier
file.name <- paste0(wd, "/table_of_estimates.html")

# Delete a table left over from a previous run before writing a new one
e <- file.exists(file.name)
if (e) {
  file.remove(file.name)
}

# Write the five models side by side as HTML, replacing the default standard
# errors with the robust ones and using a decimal comma.
stargazer(
  mod4a, mod4b, mod4c, mod4d, mod4e,
  se = list(
    robust_se_mod4a,
    robust_se_mod4b,
    robust_se_mod4c,
    robust_se_mod4d,
    robust_se_mod4e
  ),
  type = "html",
  title = "With robust standard errors",
  column.labels = "",
  decimal.mark = ",",
  out = file.name
)