##-----------------------------------------------------------------------------------------------# ## Date: 16-09-2024 ## Author: Giacomo Benini ## Institution: ERE, NHH - ERE, Stanford ## ## Project: "The Free lunch of Climate Change Mitigation" ## R-codes: "Regression Analysis" ##-----------------------------------------------------------------------------------------------# rm(list = ls(all=TRUE)) options('scipen' = 100, 'digits' = 4) ##-----------------------------------------------------------------------------------------------# ## Load required packages ##-----------------------------------------------------------------------------------------------# library(tidyverse) library(ggplot2) library(fuzzyjoin) library(scales) library(Matrix) library(TMB) library(data.table) library(plm) setwd("C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions") ##-----------------------------------------------------------------------------------------------# ## 1. First Stage Regression (Tobit Flaring) ##-----------------------------------------------------------------------------------------------# panel <- read.csv('C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/F&V_Dataset.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, encoding = 'UTF-8') # 1.1 Time frame Start <- 2005 End <- 2020 panel <- panel %>% filter(Year >= Start & Year <= End) # 1.2 Definition of Routine Flaring panel_gas <- panel %>% filter(Geo == 'Coalbed Methane' | Geo == 'Shale & Tight Gas') %>% mutate(Flaring_Rate = (Flared_Gas_B_Day / TotGas_B_Day)) %>% filter(Flaring_Rate <= 0.0516289279) quantile(panel_gas$Flaring_Rate, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE) # 1.3 Rystad vs EIA panel_oil <- panel %>% filter(Geo == 'Heavy & Extra Heavy' | Geo == 'Light & Medium' | Geo == 'Other Oil' | Geo == 'Shale & Tight Oil') %>% filter(GOR_SCF_B < 100000) %>% # Threshold defined by the EIA definition of an oilfield filter(!is.na(Flared_Gas_B_Day)) %>% mutate(Non_Routine_Flaring_B_Day = ifelse(quantile(panel_gas$Flaring_Rate, probs = 0.5, na.rm = TRUE) * TotGas_B_Day <= Flared_Gas_B_Day, quantile(panel_gas$Flaring_Rate, probs = 0.5, na.rm = TRUE) * TotGas_B_Day, Flared_Gas_B_Day), Routine_Flaring_B_Day = ifelse(Flared_Gas_B_Day - Non_Routine_Flaring_B_Day <= 0, 0, Flared_Gas_B_Day - Non_Routine_Flaring_B_Day), Dummy = ifelse(Routine_Flaring_B_Day == 0, 0, 1)) %>% distinct() summary(panel_oil$Flared_Gas_B_Day - panel_oil$Non_Routine_Flaring_B_Day - panel_oil$Routine_Flaring_B_Day) # 1.2 Panel flare panel_flare <- panel_oil %>% select(RE_ID, Asset, Year, Flared_Gas_B_Day, HH_Dollars_B, Dummy) # 1.3 Set up optimization matrix price <- panel_flare %>% group_by(Year) %>% summarise(price = mean(HH_Dollars_B)) %>% pull y_tible <- panel_flare %>% pivot_wider(id_cols = RE_ID, values_from = Flared_Gas_B_Day, names_from = Year) %>% select(c('RE_ID', as.character(Start:End))) ID <- y_tible$RE_ID y <- y_tible %<>% select(-RE_ID) %>% as.matrix rownames(y) = ID censor_tible <- panel_flare %>% pivot_wider(id_cols = RE_ID, values_from = Dummy, names_from = Year) %>% select(c('RE_ID', as.character(Start:End))) ID <- censor_tible$RE_ID censor <- censor_tible %<>% select(-RE_ID) %>% as.matrix rownames(censor) = ID data <- list(y = y, censor = censor, price = price) # (reasonable) initial values n <- nrow(y) time <- ncol(y) alpha0 <- rep(0, n) sigma <- sd(y, na.rm = TRUE) sigma_alpha <- sd(colMeans(y, na.rm = TRUE)) alpha1 <- 0 parameters <- list(alpha1 = alpha1, log_sigma = log(sigma), log_sigma_alpha = log(sigma_alpha), alpha0 = alpha0) # Compile likelihood compile("flare.cpp") dyn.load(dynlib("flare")) # Fit the model obj <- MakeADFun(data = data, parameters = parameters, random = "alpha0", DLL = "flare") obj$fn(obj$par) # Optimizing opt <- nlminb(obj$par, obj$fn, control = list(trace = TRUE)) opt # SDreport sdrep <- sdreport(obj, par.fixed = opt$par) res <- summary(sdrep, "report") # Looks good # Get random effects alpha0 = obj$env$report()$alpha0 hist(alpha0) # Get the errors error <- matrix(NA, nrow = n, ncol = time) alpha1 <- res[1,1] for(i in 1:n){ for(t in 1:time){ error[i,t] <- y[i,t] - alpha0[i] - alpha1*price[t] } } hist(error) response <- matrix(NA, nrow = n, ncol = time) for(i in 1:n){ for(t in 1:time){ response[i,t] <- alpha0[i] + alpha1*price[t] } } hist(response) # 1.5 Analysis of the First-Sage Correction Term error <- as.data.table(error) %>% pivot_longer(everything()) %>% rename(eta_hat = value) response <- as.data.table(response) %>% pivot_longer(everything()) %>% rename(Flared_Gas_B_Day_hat = value) est <- bind_cols(error, response) %>% drop_na() %>% bind_cols(panel_flare, .) %>% select(RE_ID, Asset, Year, eta_hat, Flared_Gas_B_Day_hat) alpha <- as.data.table(alpha0) %>% pivot_longer(everything()) %>% rename(alpha0_hat = value) panel_RE_ID <- panel_flare %>% distinct(RE_ID) %>% bind_cols(., alpha) %>% select(RE_ID, alpha0_hat) # 1.6 Final Dataset panel_oil <- left_join(panel_oil, est) %>% left_join(., panel_RE_ID) %>% mutate(x = -Flared_Gas_B_Day_hat/sigma, eta_hat = ifelse(Routine_Flaring_B_Day == 0, -sigma*dnorm(x,0,1)/pnorm(x,0,1), eta_hat), eta_hat_s = (eta_hat - mean(eta_hat)) / sd(eta_hat), Inter_1 = HH_Dollars_B * Dummy, Inter_2 = eta_hat * Dummy, Inter_3 = HH_Dollars_B * Re_Gas_B_Day, Inter_4 = eta_hat_s * Re_Gas_B_Day, Inter_5 = HH_Dollars_B * Insitu_Gas_B_Day, Inter_6 = eta_hat_s * Insitu_Gas_B_Day) %>% distinct() %>% drop_na(Inter_1, Inter_2, Inter_3, Inter_4, Inter_5, Inter_6) %>% mutate(alpha1_hat = alpha1) %>% select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo, Oil_B_Day, TotGas_B_Day, Sold_Gas_B_Day, Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, Other_Gas_B_Day, Insitu_Gas_B_Day, Re_Gas_B_Day, HH_Dollars_B, Lag_HH_Future_Dollar_B, WTI_Dollars_B, eta_hat, Dummy, Inter_1, Inter_2, Inter_3, Inter_4, Inter_5, Inter_6, alpha0_hat, alpha1_hat, GOR_SCF_B) rm(censor, censor_tible, data, error, est, obj, panel, panel_flare, parameters, res, sdrep, y, y_tible, alpha0, i, ID, n, opt, price, response, sigma, sigma_alpha, t, time) rm(alpha1, End, Start) summary(panel_oil) # 1.7 eta_hat Analysis panel_oil <- panel_oil %>% mutate(sel = ifelse(Routine_Flaring_B_Day > 0, 'Routine Flare > 0', 'Otherwise'), sel_geo = interaction(sel, Geo, sep = '-')) tapply(panel_oil$eta_hat, panel_oil$sel, summary) tapply(panel_oil$eta_hat, panel_oil$sel, quantile, probs = 0:10/10) tapply(panel_oil$eta_hat, panel_oil$sel, sd) tapply(panel_oil$eta_hat, panel_oil$sel_geo, summary, digits = 4) tapply(panel_oil$eta_hat, panel_oil$sel_geo, quantile, probs = 0:10/10) tapply(panel_oil$eta_hat, panel_oil$sel_geo, sd) #test <- wlicox.test(panel_oil$eta_hat ~ panel_oil$sel) #test panel_plot <- panel_oil %>% filter(sel == 'Routine Flare > 0') max(panel_plot$eta_hat, na.rm = TRUE) / min(panel_plot$eta_hat, na.rm = TRUE) # 1.8 eta_hat Plot dens_eta <- ggplot(data = panel_plot) + geom_density(aes(x = eta_hat), size = 1) + theme_minimal() + xlim(-100, 500) + labs(x = 'BOE/Day', y = 'Empirical Density') + theme(legend.direction = "horizontal", legend.position = "bottom", axis.text.x = element_text(angle = 90, hjust = 1)) + guides(fill = guide_legend(title.position = 'top', title.hjust = 0.5, label.position = 'top')) dens_eta library(RColorBrewer) nb.cols <- 6 # Define the number of colors you want mycolors <- colorRampPalette(brewer.pal(11, "RdBu"))(nb.cols) dens_eta_geo <- ggplot(data = panel_plot) + geom_density(aes(x = eta_hat, color = Geo), size = 1) + theme_minimal() + xlim(-100, 200) + scale_color_manual(values = mycolors) + labs(x = 'BOE/Day', y = 'Empirical Density') + theme(legend.direction = "horizontal", legend.position = "bottom", axis.text.x = element_text(angle = 90, hjust = 1)) + guides(fill = guide_legend(title.position = 'top', title.hjust = 0.5, label.position = 'top')) dens_eta_geo ggsave("C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/0 - Manuscript/3 - A Zero-Cost Policy for Eliminating Methane Emissions in the Oil & Gas Industry - Appendix/Figure4_Appendix_EtaHatDensity.pdf", height = 15, width = 20, units = c('cm')) write.csv(panel_oil, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/First_Stage_Results.csv', row.names = FALSE) rm(list=ls(all=TRUE)) ##-----------------------------------------------------------------------------------------------# ## 2. Second Stage Regression (Linear Regression OtherGas) ##-----------------------------------------------------------------------------------------------# panel_oil <- read.csv('C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/First_Stage_Results.csv', sep = ',', header = TRUE, stringsAsFactors = FALSE, encoding = 'UTF-8') length(unique(panel_oil$RE_ID)) # 556 Oil & Gas Fields length(unique(panel_oil$Year)) # 16 Years (2005-2020) # 2.1 Explanatory Analysis of the Dependent Variable summary(panel_oil$Other_Gas_B_Day) tapply(panel_oil$Other_Gas_B_Day, panel_oil$Geo, summary, digits = 2) tapply(panel_oil$Other_Gas_B_Day, panel_oil$Geo, sd) plot(density(panel_oil$Other_Gas_B_Day, na.rm = T)) library(car) scatterplot(Other_Gas_B_Day ~ Year | Geo, boxplots = FALSE, smooth = TRUE, reg.line = FALSE, data = panel_oil) library(gplots) plotmeans(Other_Gas_B_Day ~ Year, main = "Heterogeineity across years", data = panel_oil) # 2.2 Stationary panel_sta <- plm.data(panel_oil, indexes = c('RE_ID', 'Year')) library(tseries) adf.test(panel_sta$Other_Gas_B_Day, k = 1) # no unit root present for 1 lag (p-value = 0.01 < 0.05) adf.test(panel_sta$Other_Gas_B_Day, k = 2) # no unit root present for 2 lags (p-value = 0.01 < 0.05) # 2.3 Heteroskedasticity library(lmtest) bptest(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, studentize = F) # no homoskedasticity (p-value = 0.0000000000000002 < 0.05) # 2.4 Serial Correlation among Errors pols <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'pooling') summary(pols) pbgtest(pols) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 rem <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'random') summary(rem) pbgtest(rem) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 fem <- plm(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'within') summary(fem) pbgtest(fem) # reject H0: no serial correlation in idiosyncratic errors p-value <0.0000000000000002 pFtest(fem, pols) phtest(fem, rem) # fixed effect model is better! # 2.5 PGGLS Method pols <- pggls(Other_Gas_B_Day ~ Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), model = 'pooling') summary(pols) fixed <- pggls(Other_Gas_B_Day ~ 0 + Dummy + Inter_1 + Inter_2 + Oil_B_Day + Lag_HH_Future_Dollar_B + eta_hat + Re_Gas_B_Day + Insitu_Gas_B_Day + Inter_3 + Inter_4 + Inter_5 + Inter_6, data = panel_oil, index = c('RE_ID', 'Year'), effect = 'individual', model = 'within') summary(fixed) fe <- as.data.frame(fixef(fixed)) %>% mutate(RE_ID = rownames(.)) panel_fv <- left_join(panel_oil, fe) %>% mutate(Term_1 = `fixef(fixed)`, # UVent Term_2 = fixed$coefficients[1] * Dummy, # IVent Term_3 = fixed$coefficients[2] * HH_Dollars_B * Dummy, # IVent Term_4 = fixed$coefficients[3] * eta_hat * Dummy, # IVent Term_5 = fixed$coefficients[4] * Oil_B_Day, # UVent Term_6 = fixed$coefficients[5] * Lag_HH_Future_Dollar_B, # UVent Term_7 = fixed$coefficients[6] * eta_hat, delta2_hat = fixed$coefficients[2], delta4_hat = fixed$coefficients[4], delta5_hat = fixed$coefficients[5]) %>% rowwise() %>% mutate(IVented_Gas_B_Day = sum(Term_2, Term_3, Term_4), UVented_Gas_B_Day = sum(Term_1, Term_5, Term_6, Term_7)) %>% ungroup() %>% select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo, Oil_B_Day, TotGas_B_Day, Sold_Gas_B_Day, Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, IVented_Gas_B_Day, UVented_Gas_B_Day, Insitu_Gas_B_Day, Re_Gas_B_Day, GOR_SCF_B, HH_Dollars_B, WTI_Dollars_B, alpha0_hat, alpha1_hat, delta2_hat, delta4_hat, delta5_hat) quantile(panel_fv$Non_Routine_Flaring_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE) quantile(panel_fv$Routine_Flaring_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE) quantile(panel_fv$IVented_Gas_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE) quantile(panel_fv$UVented_Gas_B_Day, probs = seq(0, 1, 0.05), na.rm = TRUE, names = TRUE) tapply(panel_fv$Non_Routine_Flaring_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$Non_Routine_Flaring_B_Day, panel_fv$Geo, sd) tapply(panel_fv$Routine_Flaring_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$Routine_Flaring_B_Day, panel_fv$Geo, sd) tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, sd) tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, sd) # 2.1 Model Mistakes # Non-Routine Flaring est_non_routine_flaring <- panel_fv %>% filter(Routine_Flaring_B_Day == 0) # 591/4770 = 12.39% panel_non_routine_flaring <- panel_fv %>% mutate(No_Routine = ifelse(Flared_Gas_B_Day == 0, 1, 0)) %>% group_by(No_Routine) %>% summarise(Oil_B_Day = sum(Oil_B_Day, na.rm = TRUE)) %>% ungroup() # Model Mistakes est_mistake_ivent <- panel_fv %>% filter(IVented_Gas_B_Day < 0) # 571/4770 = 11.97% est_mistake_uvent <- panel_fv %>% filter(UVented_Gas_B_Day < 0) # 1555/4770 = 32.60% # 2.3 Final Dataset panel_fv <- panel_fv %>% mutate(IVented_Gas_B_Day = ifelse(IVented_Gas_B_Day <= 0, 0, IVented_Gas_B_Day), UVented_Gas_B_Day = ifelse(UVented_Gas_B_Day <= 0, 0, UVented_Gas_B_Day), Vented_Gas_B_Day = IVented_Gas_B_Day + UVented_Gas_B_Day) %>% select(RE_ID, Asset, Year, Longitude, Latitude, Oil_Gas, Geo, Oil_B_Day, TotGas_B_Day, Sold_Gas_B_Day, Flared_Gas_B_Day, Non_Routine_Flaring_B_Day, Routine_Flaring_B_Day, Vented_Gas_B_Day, IVented_Gas_B_Day, UVented_Gas_B_Day, Insitu_Gas_B_Day, Re_Gas_B_Day, GOR_SCF_B, HH_Dollars_B, WTI_Dollars_B, alpha0_hat, alpha1_hat, delta2_hat, delta4_hat, delta5_hat) tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$IVented_Gas_B_Day, panel_fv$Geo, sd) tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, summary, digits = 2) tapply(panel_fv$UVented_Gas_B_Day, panel_fv$Geo, sd) #write.csv(panel_fv, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/Second_Stage_Results.csv', row.names = FALSE) write.csv(panel_fv, file ='C:/Users/s15535/Desktop/2 - Energy Economics/2 - Oil & Natural Gas/2.5 - Flaring & Venting/1 - American FV&L/2 - Codes/1 - Upstream Emissions/Second_Stage_Results_NEW.csv', row.names = FALSE) rm(list=ls(all=TRUE)) ##-----------------------------------------------------------------------------------------------# ##-----------------------------------------------------------------------------------------------#