##-----------------------------------------------------------------------------------------------#
## Date:         16-09-2024
## Author:       Giacomo Benini 
## Institution:  ERE, NHH - ERE, Stanford
##
## Project:     "The Free lunch of Climate Change Mitigation"
## R-codes:     "Regression Analysis"
##-----------------------------------------------------------------------------------------------#

rm(list = ls(all=TRUE)) 
options('scipen' = 100, 'digits' = 4)

##-----------------------------------------------------------------------------------------------#
## Load required packages
##-----------------------------------------------------------------------------------------------#

library(tidyverse)
library(ggplot2)
library(fuzzyjoin)
library(scales)

library(Matrix)
library(TMB)
library(data.table)

library(plm)

setwd("C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions")

##-----------------------------------------------------------------------------------------------#
## 1. First Stage Regression (Tobit Flaring)
##-----------------------------------------------------------------------------------------------#
panel <- read.csv('C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/F&V_Dataset.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, encoding = 'UTF-8')

# 1.1 Time frame
Start <- 2005  
End   <- 2020

panel <- panel %>%
  filter(Year >= Start & Year <= End)

# 1.2 Definition of Routine Flaring

panel_gas <- panel %>%
  filter(Geo == 'Coalbed Methane' | Geo == 'Shale & Tight Gas') %>%
  mutate(Flaring_Rate = (Flared_Gas_B_Day / TotGas_B_Day)) %>%
  filter(Flaring_Rate <= 0.0516289279)

quantile(panel_gas$Flaring_Rate, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE)

# 1.3 Rystad vs EIA
panel_oil <- panel %>% 
  filter(Geo == 'Heavy & Extra Heavy' | Geo == 'Light & Medium' | Geo == 'Other Oil' | Geo == 'Shale & Tight Oil') %>%
  filter(GOR_SCF_B < 100000) %>% # Threshold defined by the EIA definition of an oilfield
  filter(!is.na(Flared_Gas_B_Day)) %>%
  mutate(Non_Routine_Flaring_B_Day = ifelse(quantile(panel_gas$Flaring_Rate, probs = 0.5, na.rm = TRUE) * TotGas_B_Day <= Flared_Gas_B_Day, quantile(panel_gas$Flaring_Rate, probs = 0.5, na.rm = TRUE) * TotGas_B_Day, Flared_Gas_B_Day),
         Routine_Flaring_B_Day     = ifelse(Flared_Gas_B_Day - Non_Routine_Flaring_B_Day <= 0, 0, Flared_Gas_B_Day - Non_Routine_Flaring_B_Day), 
         Dummy                     = ifelse(Routine_Flaring_B_Day == 0, 0, 1)) %>%
  distinct() 

summary(panel_oil$Flared_Gas_B_Day - panel_oil$Non_Routine_Flaring_B_Day - panel_oil$Routine_Flaring_B_Day)

# 1.2 Panel flare
panel_flare <- panel_oil %>%
  select(RE_ID, Asset, Year, Flared_Gas_B_Day, HH_Dollars_B, Dummy)

# 1.3 Set up optimization matrix
price <- panel_flare %>% 
  group_by(Year) %>% 
  summarise(price = mean(HH_Dollars_B)) %>% 
  pull

y_tible <- panel_flare %>% 
  pivot_wider(id_cols     = RE_ID,
              values_from = Flared_Gas_B_Day,
              names_from  = Year) %>% 
  select(c('RE_ID', as.character(Start:End)))

ID <- y_tible$RE_ID
y  <- y_tible %<>% select(-RE_ID) %>% as.matrix    

rownames(y) = ID

censor_tible <- panel_flare %>% 
  pivot_wider(id_cols     = RE_ID,
              values_from = Dummy,
              names_from  = Year) %>% 
  select(c('RE_ID', as.character(Start:End)))

ID     <- censor_tible$RE_ID
censor <- censor_tible %<>% select(-RE_ID) %>% as.matrix    

rownames(censor) = ID

data <- list(y = y, censor = censor, price = price)

# (reasonable) initial values
n           <- nrow(y)
time        <- ncol(y)
alpha0      <- rep(0, n)
sigma       <- sd(y, na.rm = TRUE)
sigma_alpha <- sd(colMeans(y, na.rm = TRUE))
alpha1      <- 0 

parameters <- list(alpha1 = alpha1, log_sigma = log(sigma), log_sigma_alpha = log(sigma_alpha), alpha0 = alpha0)

# Compile likelihood
compile("flare.cpp")
dyn.load(dynlib("flare"))

# Fit the model
obj <- MakeADFun(data = data, parameters = parameters, random = "alpha0", DLL = "flare")
obj$fn(obj$par)

# Optimizing
opt <- nlminb(obj$par, obj$fn, control = list(trace = TRUE))
opt  

# SDreport
sdrep <- sdreport(obj, par.fixed = opt$par)
res   <- summary(sdrep, "report") # Looks good

# Get random effects
alpha0 = obj$env$report()$alpha0
hist(alpha0)

# Get the errors
error  <- matrix(NA, nrow = n, ncol = time)

alpha1 <- res[1,1]

for(i in 1:n){
  for(t in 1:time){
    
    error[i,t] <- y[i,t] - alpha0[i] - alpha1*price[t]
  }
}
hist(error)

response <- matrix(NA, nrow = n, ncol = time)

for(i in 1:n){
  for(t in 1:time){
    
    response[i,t] <- alpha0[i] + alpha1*price[t]
  }
}
hist(response)

# 1.5 Analysis of the First-Sage Correction Term

error <- as.data.table(error) %>%
  pivot_longer(everything()) %>%
  rename(eta_hat = value)

response <- as.data.table(response) %>%
  pivot_longer(everything()) %>%
  rename(Flared_Gas_B_Day_hat = value)

est <- bind_cols(error, response) %>%
  drop_na() %>%
  bind_cols(panel_flare, .) %>%
  select(RE_ID, Asset, Year, eta_hat, Flared_Gas_B_Day_hat)

alpha <- as.data.table(alpha0) %>%
  pivot_longer(everything()) %>%
  rename(alpha0_hat = value)

panel_RE_ID <- panel_flare %>%
  distinct(RE_ID) %>%
  bind_cols(., alpha) %>%
  select(RE_ID, alpha0_hat)

# 1.6 Final Dataset
panel_oil <- left_join(panel_oil, est) %>%
  left_join(., panel_RE_ID) %>%
  mutate(x         = -Flared_Gas_B_Day_hat/sigma, 
         eta_hat   = ifelse(Routine_Flaring_B_Day == 0, -sigma*dnorm(x,0,1)/pnorm(x,0,1), eta_hat), 
         eta_hat_s = (eta_hat - mean(eta_hat)) / sd(eta_hat), 
         Inter_1   = HH_Dollars_B * Dummy, 
         Inter_2   = eta_hat * Dummy, 
         Inter_3   = HH_Dollars_B * Re_Gas_B_Day, 
         Inter_4   = eta_hat_s * Re_Gas_B_Day, 
         Inter_5   = HH_Dollars_B * Insitu_Gas_B_Day, 
         Inter_6   = eta_hat_s * Insitu_Gas_B_Day) %>%
  distinct() %>%
  drop_na(Inter_1, Inter_2, Inter_3, Inter_4, Inter_5, Inter_6) %>%
  mutate(alpha1_hat = alpha1) %>%
  select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo,
         Oil_B_Day, 
         TotGas_B_Day, Sold_Gas_B_Day, Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, Other_Gas_B_Day, Insitu_Gas_B_Day, Re_Gas_B_Day, 
         HH_Dollars_B, Lag_HH_Future_Dollar_B, WTI_Dollars_B, eta_hat,  
         Dummy, Inter_1, Inter_2, Inter_3, Inter_4, Inter_5, Inter_6, alpha0_hat, alpha1_hat, GOR_SCF_B)

rm(censor, censor_tible, data, error, est, obj, panel, panel_flare, parameters, res, sdrep, y, y_tible, alpha0, i, ID, n, opt, price, response, sigma, sigma_alpha, t, time) 
rm(alpha1, End, Start)

summary(panel_oil)

# 1.7 eta_hat Analysis
panel_oil <- panel_oil %>%
  mutate(sel     = ifelse(Routine_Flaring_B_Day > 0, 'Routine Flare > 0', 'Otherwise'), 
         sel_geo = interaction(sel, Geo, sep = '-'))

tapply(panel_oil$eta_hat, panel_oil$sel, summary)
tapply(panel_oil$eta_hat, panel_oil$sel, quantile, probs = 0:10/10)
tapply(panel_oil$eta_hat, panel_oil$sel, sd)

tapply(panel_oil$eta_hat, panel_oil$sel_geo, summary, digits = 4)
tapply(panel_oil$eta_hat, panel_oil$sel_geo, quantile, probs = 0:10/10)
tapply(panel_oil$eta_hat, panel_oil$sel_geo, sd)

#test <- wlicox.test(panel_oil$eta_hat ~ panel_oil$sel)
#test

panel_plot <- panel_oil %>%
  filter(sel == 'Routine Flare > 0')

max(panel_plot$eta_hat, na.rm = TRUE) / min(panel_plot$eta_hat, na.rm = TRUE)

# 1.8 eta_hat Plot
dens_eta <- ggplot(data = panel_plot) + 
  geom_density(aes(x = eta_hat), size = 1) + 
  theme_minimal() + 
  xlim(-100, 500) + 
  labs(x = 'BOE/Day', y = 'Empirical Density') + 
  theme(legend.direction = "horizontal", legend.position = "bottom", axis.text.x = element_text(angle = 90, hjust = 1)) + 
  guides(fill = guide_legend(title.position = 'top', title.hjust = 0.5, label.position = 'top'))

dens_eta

library(RColorBrewer)
nb.cols  <- 6 # Define the number of colors you want
mycolors <- colorRampPalette(brewer.pal(11, "RdBu"))(nb.cols)

dens_eta_geo <- ggplot(data = panel_plot) + 
  geom_density(aes(x = eta_hat, color = Geo), size = 1) + 
  theme_minimal() + 
  xlim(-100, 200) + 
  scale_color_manual(values = mycolors) + 
  labs(x = 'BOE/Day', y = 'Empirical Density') + 
  theme(legend.direction = "horizontal", legend.position = "bottom", axis.text.x = element_text(angle = 90, hjust = 1)) + 
  guides(fill = guide_legend(title.position = 'top', title.hjust = 0.5, label.position = 'top'))

dens_eta_geo

ggsave("C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/0 - Manuscript/3 - A Zero-Cost Policy for Eliminating Methane Emissions in the Oil & Gas Industry - Appendix/Figure4_Appendix_EtaHatDensity.pdf", height = 15, width = 20, units = c('cm'))

write.csv(panel_oil, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/First_Stage_Results.csv', row.names = FALSE)
rm(list=ls(all=TRUE))

##-----------------------------------------------------------------------------------------------#
## 2. Second Stage Regression (Linear Regression OtherGas) 
##-----------------------------------------------------------------------------------------------#
panel_oil <- read.csv('C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/First_Stage_Results.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, encoding = 'UTF-8')

length(unique(panel_oil$RE_ID)) # 556 Oil & Gas Fields 
length(unique(panel_oil$Year))  #  16 Years (2005-2020)

# 2.1 Explanatory Analysis of the Dependent Variable

summary(panel_oil$Other_Gas_B_Day)

tapply(panel_oil$Other_Gas_B_Day, panel_oil$Geo, summary, digits = 2)
tapply(panel_oil$Other_Gas_B_Day, panel_oil$Geo, sd)

plot(density(panel_oil$Other_Gas_B_Day, na.rm = T))

library(car)
scatterplot(Other_Gas_B_Day ~ Year | Geo, boxplots = FALSE, smooth = TRUE, reg.line = FALSE, data = panel_oil)

library(gplots)
plotmeans(Other_Gas_B_Day ~ Year, main = "Heterogeineity across years", data = panel_oil)

# 2.2 Stationary
panel_sta <- plm.data(panel_oil, indexes = c('RE_ID', 'Year'))

library(tseries)

adf.test(panel_sta$Other_Gas_B_Day, k = 1) # no unit root present for 1 lag (p-value = 0.01 < 0.05)
adf.test(panel_sta$Other_Gas_B_Day, k = 2) # no unit root present for 2 lags (p-value = 0.01 < 0.05)

# 2.3 Heteroskedasticity
library(lmtest)
bptest(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, studentize = F) # no homoskedasticity (p-value = 0.0000000000000002 < 0.05)

# 2.4 Serial Correlation among Errors
pols <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'pooling')
summary(pols)
pbgtest(pols) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 

rem <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'random')
summary(rem)
pbgtest(rem) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 

fem <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'within')
summary(fem)
pbgtest(fem) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 

pFtest(fem, pols)
phtest(fem, rem) # fixed effect model is better!

# 2.5 PGGLS Method
pols <- pggls(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), model = 'pooling')
summary(pols)

fixed <- pggls(Other_Gas_B_Day ~ 0 + Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'within')
summary(fixed)

fe <- as.data.frame(fixef(fixed)) %>%
  mutate(RE_ID = rownames(.)) 

panel_fv <- left_join(panel_oil, fe) %>%
  mutate(Term_1     = `fixef(fixed)`,                                     # UVent
         Term_2     = fixed$coefficients[1] * Dummy,                      # IVent
         Term_3     = fixed$coefficients[2] * HH_Dollars_B * Dummy,       # IVent
         Term_4     = fixed$coefficients[3] * eta_hat * Dummy,            # IVent
         Term_5     = fixed$coefficients[4] * Oil_B_Day,                  # UVent
         Term_6     = fixed$coefficients[5] * Lag_HH_Future_Dollar_B,     # UVent
         Term_7     = fixed$coefficients[6] * eta_hat, 
         delta2_hat = fixed$coefficients[2], 
         delta4_hat = fixed$coefficients[4], 
         delta5_hat = fixed$coefficients[5]) %>%         
  rowwise() %>%
  mutate(IVented_Gas_B_Day = sum(Term_2, Term_3, Term_4), 
         UVented_Gas_B_Day = sum(Term_1, Term_5, Term_6, Term_7)) %>%
  ungroup() %>%
  select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo,  
         Oil_B_Day, TotGas_B_Day, Sold_Gas_B_Day,  
         Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, 
         IVented_Gas_B_Day, UVented_Gas_B_Day,  
         Insitu_Gas_B_Day, Re_Gas_B_Day, 
         GOR_SCF_B, HH_Dollars_B, WTI_Dollars_B, 
         alpha0_hat, alpha1_hat, delta2_hat, delta4_hat, delta5_hat) 

quantile(panel_fv$Non_Routine_Flaring_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE)
quantile(panel_fv$Routine_Flaring_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE)
quantile(panel_fv$IVented_Gas_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE)
quantile(panel_fv$UVented_Gas_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE)

tapply(panel_fv$Non_Routine_Flaring_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$Non_Routine_Flaring_B_Day, panel_fv$Geo, sd)


tapply(panel_fv$Routine_Flaring_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$Routine_Flaring_B_Day, panel_fv$Geo, sd)

tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, sd)

tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, sd)

# 2.1 Model Mistakes

# Non-Routine Flaring
est_non_routine_flaring <- panel_fv %>%
  filter(Routine_Flaring_B_Day == 0) # 591/4770 = 12.39%

panel_non_routine_flaring <- panel_fv %>%
  mutate(No_Routine = ifelse(Flared_Gas_B_Day == 0, 1, 0)) %>%
  group_by(No_Routine) %>%
  summarise(Oil_B_Day = sum(Oil_B_Day, na.rm = TRUE)) %>%
  ungroup()

# Model Mistakes
est_mistake_ivent <- panel_fv %>%
  filter(IVented_Gas_B_Day < 0) # 571/4770 = 11.97%

est_mistake_uvent <- panel_fv %>%
  filter(UVented_Gas_B_Day < 0) # 1555/4770 = 32.60%

# 2.3 Final Dataset

panel_fv <- panel_fv %>%
  mutate(IVented_Gas_B_Day = ifelse(IVented_Gas_B_Day <= 0, 0, IVented_Gas_B_Day), 
         UVented_Gas_B_Day = ifelse(UVented_Gas_B_Day <= 0, 0, UVented_Gas_B_Day), 
         Vented_Gas_B_Day  = IVented_Gas_B_Day + UVented_Gas_B_Day) %>%
  select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo,  
         Oil_B_Day, TotGas_B_Day, Sold_Gas_B_Day,
         Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, 
         Vented_Gas_B_Day, IVented_Gas_B_Day, UVented_Gas_B_Day, 
         Insitu_Gas_B_Day, Re_Gas_B_Day, 
         GOR_SCF_B, HH_Dollars_B, WTI_Dollars_B,  
         alpha0_hat, alpha1_hat, delta2_hat, delta4_hat, delta5_hat)

tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, sd)

tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2)
tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, sd)


#write.csv(panel_fv, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/Second_Stage_Results.csv', row.names = FALSE)
write.csv(panel_fv, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/Second_Stage_Results_NEW.csv', row.names = FALSE)

rm(list=ls(all=TRUE))

##-----------------------------------------------------------------------------------------------#
##-----------------------------------------------------------------------------------------------#