# Exemplo apresentado em Facure (2022), capítulo 11
# Baseado em Athey e Wagner (2019): https://doi.org/10.1353/obs.2019.0001
# https://matheusfacure.github.io/python-causality-handbook/11-Propensity-Score.html

if (! "Hmisc" %in% installed.packages()) install.packages("Hmisc", dep = T)  # for describe
if (! "MatchIt" %in% installed.packages()) install.packages("MatchIt", dep = T)
if (! "marginaleffects" %in% installed.packages()) install.packages("marginaleffects", dep = T)
if (! "dplyr" %in% installed.packages()) install.packages("dplyr", dep = T)
if (! "data.table" %in% installed.packages()) install.packages("data.table", dep = T)

library("Hmisc")
library("MatchIt")
library("marginaleffects")
library("dplyr")
library("data.table")

rm()

setwd("./")

# Ler banco de dados (learning_mindset.csv) e listar significado de suas variáveis
# Dados obtidos de: https://github.com/ethen8181/machine-learning/blob/master/ab_tests/causal_inference/data/learning_mindset.csv 
#####
dados <- read.csv("learning_mindset.csv")
head(dados)
dim(dados)

# Gerar id única do estudante (para análises adiante)
dados = dados %>% 
  mutate(id = as.character(row_number() + 100000000))
dados = as.data.frame(dados)
head(dados)
describe(dados$id)
id_all = dados$id

# Besides the treated (intervention) and outcome (achievement_score) variables, the study also recorded some other features:
# * schoolid: identifier of the student’s school;
# * success_expect: self-reported expectations for success in the future, a proxy for prior achievement, measured prior to random assignment;
# * ethnicity: categorical variable for student race/ethnicity;
# * gender: categorical variable for student identified gender;
# * frst_in_family: categorical variable for student first-generation status, i.e. first in family to go to college;
# * school_urbanicity: school-level categorical variable for urbanicity of the school, i.e. rural, suburban, etc;
# * school_mindset: school-level mean of students’ fixed mindsets, reported prior to random assignment, standardized;
# * school_achievement: school achievement level, as measured by test scores and college preparation for the previous 4 cohorts of students, standardized;
# * school_ethnic_minority: school racial/ethnic minority composition, i.e., percentage of student body that is Black, Latino, or Native American, standardized;
# * school_poverty: school poverty concentration, i.e., percentage of students who are from families whose incomes fall below the federal poverty line, standardized;
# * school_size: total number of students in all four grade levels in the school, standardized.
#####

# Gerar versão factor de variáveis categóricas: dados$f_ethnicity, dados$f_gender, dados$f_school_urbanicity
#####
dados$f_ethnicity = as.factor(dados$ethnicity)
dados$f_gender = as.factor(dados$gender)
dados$f_school_urbanicity = as.factor(dados$school_urbanicity)
#####

# Calcular o ATT via pareamento. Para tanto, remover as linhas que contenham missing values e aplicar os seguintes parâmetros:
# a. Distância euclidiana;
# b. Vizinho mais próximo;
# c. Com reposição;
# d. Variáveis de pareamento: ethnicity, gender, school_urbanicity,
#                             school_mindset, school_achievement, school_ethnic_minority,
#                             school_poverty, school_size; e
# e. Número de matches permitidos: 1; e
# f. Com correção por viés de pareamento. Could no find a way of performing bias correction with package MatchIt.
#####
dim(dados)
dados = dados[complete.cases(dados),]
dim(dados)

m1 <- matchit(intervention ~ f_ethnicity +
                f_gender +
                f_school_urbanicity +
                school_mindset +
                school_achievement +
                school_ethnic_minority +
                school_poverty +
                school_size,
              data = dados,
              distance = "scaled_euclidean",
              method = "nearest",
              replace = TRUE,
              ratio = 1)
m1

md1 <- match_data(m1)
head(md1)
tail(md1)
dim(md1)
describe(md1$weights) # from 0.239953 to 12.1702
describe(md1$id)
describe(md1$intervention)
table(md1$intervention) # 0 = 792/ 1 = 3384
id_md1 = md1$id[md1$intervention==0]
length(id_md1) # 792

# From https://cran.r-project.org/web/packages/MatchIt/MatchIt.pdf: "Matching with replacement: For matching with replacement, units are not assigned to unique strata. For the ATT, each treated unit gets a weight of 1. Each control unit is weighted as the sum of the inverse of the number of control units matched to the same treated unit across its matches. For example, if a control unit was matched to a treated unit that had two other control units matched to it, and that same control was matched to a treated unit that had one other control unit matched to it, the control unit in question would get a weight of 1/3 + 1/2 = 5/6."

# Visualizar distâncias calculadas # Demora alguns minutos
dist = scaled_euclidean_dist(~ as.factor(ethnicity) +
  as.factor(gender) +
  as.factor(school_urbanicity) +
  school_mindset +
  school_achievement +
  school_ethnic_minority +
  school_poverty +
  school_size,
data = dados)
View(dist) # Demora alguns minutos
dim(dist)

# Avaliar qualidade do pareamento
dim(dados)
head(md1)
tail(md1)
dim(md1) # for only the 4176 matched students
# From https://cran.r-project.org/web/packages/MatchIt/vignettes/MatchIt.html: "Values of standardized mean differences and eCDF statistics close to zero and values of variance ratios close to one indicate good balance [...]."
# Gemini: "In MatchIt in R, eCDF (Empirical Cumulative Distribution Function) refers to a plot and statistics used to assess covariate balance after matching. It visually compares the cumulative distribution of covariates between treatment and control groups. Deviations between the eCDF lines indicate distributional imbalance."
# Gemini: "In MatchIt in R, the absolute standardized mean difference (ASMD) is a measure used to assess covariate balance before and after matching or weighting. It's calculated by taking the absolute difference in means between the treatment and control groups for a specific covariate and dividing it by a standard deviation."
# From https://stats.stackexchange.com/questions/499397/matchit-package-what-is-matched-ess-in-the-summary-table: "The effective sample size (ESS) is the size of an unweighted sample carrying approximately the same precision as the weighted sample in question."

summary(m1)
plot(m1, type = "density", interactive = T,
     which.xs = ~ f_ethnicity +
       f_gender +
       f_school_urbanicity +
       school_mindset +
       school_achievement +
       school_ethnic_minority +
       school_poverty +
       school_size) # which.xs can be omitted
# From https://cran.r-project.org/web/packages/MatchIt/vignettes/assessing-balance.html: "The x-axis displays the covariate values and the y-axis displays the density of the sample at that covariate value. For categorical variables, the y-axis displays the proportion of the sample at that covariate value. The black line corresponds to the treated group and the gray line to the control group. Perfectly overlapping lines indicate good balance. Density plots display similar information to eCDF plots but may be more intuitive for some users because of their link to histograms."
# From https://cran.r-project.org/web/packages/MatchIt/MatchIt.pdf: "When interactive = TRUE, plots for three variables will be displayed at a time, and the prompt in the console allows you to move on to the next set of variables. When interactive = FALSE, multiple pages are plotted at the same time, but only the last few variables will be visible in the displayed plot. To see only a few specific variables at a time, use the which.xs argument to display plots for just those variables." 

plot(summary(m1)) # Love plot

# Calcular efeito: passo 1: calcular outcome model 
fit1a <- lm(achievement_score ~ intervention,
            data = md1, 
            weights = weights)

# Calcular efeito: passo 2: g-computation e intervalo de confiança
# From https://cran.r-project.org/web/packages/MatchIt/vignettes/estimating-effects.html: "To estimate marginal effects, we use a method known as g-computation (Snowden, Rose, and Mortimer 2011) or regression estimation (Schafer and Kang 2008). This involves first specifying a model for the outcome as a function of the treatment and covariates. Then, for each unit, we compute their predicted values of the outcome setting their treatment status to treated, and then again for control, leaving us with two predicted outcome values for each unit, which are estimates of the potential outcomes under each treatment level. We compute the mean of each of the estimated potential outcomes across the entire sample, which leaves us with two average estimated potential outcomes. Finally, the contrast of these average estimated potential outcomes (e.g., their difference or ratio, depending on the effect measure desired) is the estimate of the treatment effect. When doing g-computation after matching, a few additional considerations are required. First, when we take the average of the estimated potential outcomes under each treatment level, this must be a weighted average that incorporates the matching weights. Second, if we want to target the ATT or ATC, we only estimate potential outcomes for the treated or control group, respectively (though we still generate predicted values under both treatment and control)."

att1a = avg_comparisons(fit1a,
                       variables = "intervention",
                       vcov = "HC3",
                       newdata = subset(intervention == 1)) # para ATT
att1a

# Agora, adicionar covariáveis à equação que calcula efeito (passo 1)
fit1b <- lm(achievement_score ~ intervention + f_ethnicity +
                                                 f_gender +
                                                 f_school_urbanicity +
                                                 school_mindset +
                                                 school_achievement +
                                                 school_ethnic_minority +
                                                 school_poverty +
                                                 school_size,
           data = md1, 
           weights = weights)

att1b = avg_comparisons(fit1b,
                        variables = "intervention",
                        vcov = "HC3",
                        newdata = subset(intervention == 1))
att1b
#####

# Novamente, calcular o ATT via pareamento, mas desta vez utilizando o propensity score como medida sintética para pareamento. Os demais parâmetros de pareamento devem ser mantidos.
#####
m2 <- matchit(intervention ~ f_ethnicity +
                f_gender +
                f_school_urbanicity +
                school_mindset +
                school_achievement +
                school_ethnic_minority +
                school_poverty +
                school_size,
              data = dados,
              distance = "glm", # para usar propensity scores calculados via modelo logit tendo intervention como variável dependente
              method = "nearest",
              replace = TRUE,
              ratio = 1)
m2 # Abaixo analisaremos a diferença de linhas entre md1 (4176) e md2 (4196)

# Avaliar qualidade do pareamento
summary(m2)
plot(m2, type = "jitter", interactive = FALSE)
plot(m2, type = "density", interactive = T)

# Explicar razão de muitas matched control units serem representadas por círculos maiores 
#####
g2 = get_matches(
  m2,
  distance = "glm_get_matches", # name of a column
  weights = "weights_get_matches", # name of a column (not the same as the weights generated by match_data)
  subclass = "subclass_get_matches", # name of a column
  id = "id_get_matches", # name of a column
  data = dados,
  include.s.weights = TRUE
)
head(g2)
describe(g2$weights_get_matches) # all = 1
describe(g2$id_get_matches[g2$intervention==1])
describe(g2$id_get_matches[g2$intervention==0])
t = as.data.table(table(g2$id_get_matches[g2$intervention==0]))
t = t[order(t$N, decreasing=T),]
t # as many as 54 pairs with the same untreated unit

g1 = get_matches( # Analiser g1, ainda que não possamos comparar diretamente com jittered plot of g2 (distance = "scaled_euclidean" não permite produzir jittered plot)
  m1,
  distance = "scaled_euclidean_get_matches", # name of a column
  weights = "weights_get_matches", # name of a column (not the same as the weights generated by match_data)
  subclass = "subclass_get_matches", # name of a column
  id = "id_get_matches", # name of a column
  data = dados,
  include.s.weights = TRUE
)
head(g1)
describe(g1$weights_get_matches) # all = 1
describe(g1$id_get_matches[g1$intervention==1])
describe(g1$id_get_matches[g1$intervention==0])
t = as.data.table(table(g1$id_get_matches[g1$intervention==0]))
t = t[order(t$N, decreasing=T),]
t # as many as 52 pairs with the same untreated unit
#####

dim(dados)
md2 <- match_data(m2)
head(md2)
tail(md2)
dim(md2) # for only the 4196 matched students
describe(md2$weights) # from 0.239953 to 12.9574
describe(md2$distance)
describe(md2$id)
describe(md2$intervention)
table(md2$intervention) # 0 = 812/ 1 = 3384
id_md2 = md2$id[md2$intervention==0]
length(id_md2) # 812

# Analisar diferença em número de linhas entre md1 (4176) e md2 (4196)
#####
d = setdiff(id_md2, id_md1)
d
length(d) # 618

# Diferença de linhas entre md1 (4176) e md2 (4196) deve-se a um conjunto bastante diferente de unidades não tratadas que foram pareadas. Das 792 unidades não tratadas que foram pareadas em m1, 618 diferem das 812 unidades não tratadas que foram pareadas em m2.
#####

# Calcular efeito: passo 1: calcular outcome model
fit2a <- lm(achievement_score ~ intervention,
           data = md2, 
           weights = weights)

# Calcular efeito: passo 2: g-computation e intervalo de confiança
att2a = avg_comparisons(fit2a,
                       variables = "intervention",
                       vcov = "HC3",
                       newdata = subset(intervention == 1))
att2a

# Agora, adicionar covariáveis à equação que calcula efeito (passo 1)
fit2b <- lm(achievement_score ~ intervention + f_ethnicity +
                                                  f_gender +
                                                  f_school_urbanicity +
                                                  school_mindset +
                                                  school_achievement +
                                                  school_ethnic_minority +
                                                  school_poverty +
                                                  school_size,
           data = md2, 
           weights = weights)

att2b = avg_comparisons(fit2b,
                        variables = "intervention",
                        vcov = "HC3",
                        newdata = subset(intervention == 1))
att2b
#####

# RESUMO #########################
# Comparar qualidade do pareamento
#####
summary(m1)
summary(m2) # Pareamento com propensity score parece inferior
# From https://stats.stackexchange.com/questions/584456/how-to-interpret-the-standardized-pair-distances-std-pair-dist-in-matchit: This value [Std. Pair Dist.] is computed as the average of the distance between units within a pair on the given covariate. It's easiest to think about this with 1:1 matching. Consider a matched pair, and take the difference between the treated unit in that pair's value of the covariate and the control unit in that pair's value of the covariate. Then take the absolute value of this difference to make it a distance. Then take the average of these distances across all pairs. When standardize = TRUE (the default), the average is then standardized in the same way the standardized mean differences are.
#####

# Comparar efeitos estimados: ATT
#####
# distance = "scaled_euclidean", without covariates
att1a
# distance = "scaled_euclidean", with covariates
att1b
# distance = "glm", without covariates
att2a
# distance = "glm", with covariates
att2b
#####