library(ggplot2) ###### 1) read in and data preparation ###### saline<- read.table("Lakes_categories_analysis.csv", dec = ".", sep = ",", header = T) services<- read.table("no_of_services.csv", dec = ",", sep = ",", header = T) sal.work<-saline str(sal.work) colnames(sal.work)[1:14]<-c("remoteness", "lake", "surface", "volume", "s_v_ratio","max_depth","av_depth", "salinity","salinity_var", "ephemeral","fish_p", "protected", "endangered_sp","endemic_sp") # synchronise the two different data-sets... services<-services[match(sal.work$lake,services$Lake),] sal.work$totalservice2<-rowSums(services[,c(5:7,9:14)]) # check for colinearity in predictors - that is fine... cor(sal.work[is.na(sal.work$salinity_var)==F,match(c("remoteness", "surface", "volume", "salinity", "salinity_var"), colnames(sal.work))]) # sort out important dependent variables # first total number of services # sort out important predictors # "remoteness", "surface", "volume", "salinity","salinity_var", "protected", water_type, sal_range sal.work$sal_range<- sal.work$salinity * sal.work$salinity_var table(sal.work$continentalsaline) table(sal.work$continentalsoda) table(sal.work$seawater) sal.work$water_type<- "soda"; sal.work$water_type[sal.work$continentalsaline==1]<-"saline" sal.work$water_type[sal.work$seawater==1]<-"seawater" # remove arctic lakes services$Long<-as.numeric(services$Long) services$Lat<-as.numeric(services$Lat) eliminate<-services$Lake [which(services$Lat<=c(-60))] services<- services[-match(eliminate, services$Lake),] sal.work<- sal.work[-match(eliminate, sal.work$lake),] ###### 2) Drivers of the number of ecosystem services provided per lake ###### # first linearisation: different models for transformed predictors were compared # transformations that improved the AIC of the full model, are listed below sal.work.log<-sal.work sal.work.sqr<-sal.work sal.work.log$sal_range<-log(sal.work$remoteness) sal.work.sqr$sal_range<-(sal.work$remoteness)^2 reference.model<-lm(data = sal.work, totalservice2 ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected + sal_range) trans.log<-lm(data = sal.work.log, totalservice2 ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected + sal_range) trans.sqr<-lm(data = sal.work.sqr, totalservice2 ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected + sal_range) AIC(reference.model, trans.log, trans.sqr) summary(reference.model) # required transformations are listed here: sal.work.mod<-sal.work sal.work.mod$surface<-log(sal.work$surface) sal.work.mod$volume<-log(sal.work$volume) sal.work.mod$salinity<-log(sal.work$salinity) sal.work.mod$sal_range<-log(sal.work$sal_range) sal.work.mod$remoteness<-log(sal.work$remoteness) # standardise to compare predictor effect sizes: predictors.std<-which(is.element(colnames(sal.work.mod), c("remoteness", "surface", "volume", "salinity", "salinity_var"))) for(i in predictors.std){sal.work.mod[,i]<-as.numeric(scale(sal.work.mod[,i])) } # The end result of full-model building selection process: working.hypoth<-lm(data = sal.work.mod, totalservice2 ~ remoteness + surface ) summary(working.hypoth) mod.1<-lm(data = sal.work.mod, totalservice2 ~ remoteness + surface ) AIC(working.hypoth, mod.1) plot(mod.1) # get data required for plotting res<-residuals(mod.1) plotdata<-data.frame(residuals = residuals(mod.1)) plotdata$remote_data<- sal.work.mod$remoteness * mod.1$coefficients[2] + mod.1$coefficients[1] + residuals(mod.1) plotdata$surface_data<- sal.work.mod$surface * mod.1$coefficients[3] + mod.1$coefficients[1] + residuals(mod.1) plotdata$remoteness<- sal.work$remoteness plotdata$surface<- sal.work$surface plotdata$remote_effect<- sal.work.mod$remoteness * mod.1$coefficients[2] + mod.1$coefficients[1] plotdata$surface_effect<- sal.work.mod$surface * mod.1$coefficients[3] + mod.1$coefficients[1] new.surface<-data.frame(surface = sal.work.mod$surface, remoteness=mean(sal.work.mod$remoteness)) x<-as.data.frame(predict(mod.1, new.surface, interval = "confidence", level = 0.95)) plotdata$surface.upr<-x$upr; plotdata$surface.lwr<-x$lwr new.remoteness<-data.frame(remoteness = sal.work.mod$remoteness, surface=mean(sal.work.mod$surface)) x<-as.data.frame(predict(mod.1, new.remoteness, interval = "confidence", level = 0.95)) plotdata$rem.upr<-x$upr; plotdata$rem.lwr<-x$lwr # plot the impact of both predictors, accounting for impact of the not-displayed predictor library(ggplot2) ggplot(data = plotdata, aes(ymin=surface.lwr, ymax = surface.upr, y=surface_data, x=log10(surface))) + geom_line(aes(y=surface_effect), colour = "darkred") + theme_bw() + geom_ribbon( alpha = 0.4, fill = "darkred") +geom_point(size = 2.5) ggplot(data = plotdata, aes(y=remote_data, x= (remoteness))) + geom_line(aes(y=remote_effect), colour = "darkred") + theme_bw() + geom_ribbon(aes(ymin=rem.lwr, ymax = rem.upr), alpha = 0.4, fill = "darkred") + geom_point(size = 2.3) colnames(plotdata) ###### 3) Drivers of endangered species ###### species<-read.table("Endangered_sp.csv", dec = ".", sep = ",", header = T) # try to get the lake names synchronized between different data sets (account for spelling mistakes, etc) species$Lake[species$Lake=="Sambhar Salt Lake"]<-"The Sambhar Salt Lake" species$Lake[species$Lake=="Seewinkel Soda Pans"]<-"Soda pans in the Seewinkel district" species$Lake[species$Lake=="Ace lake"]<-"Ace Lake" species$Lake[species$Lake=="Slano Kopovo"]<-"S\xa2skop\xa2 or Slano Kopovo" species$Lake[species$Lake=="Aydar-Arnasay lake system"]<-"Aydar-Arnasay lake system (AALS)" species$Lake[species$Lake=="Lake Zun-Torey"]<-"Lake Zun-Torey (Torey lakes)" species$Lake[species$Lake=="Great Salt Lake"]<-"Great Salt lake" species$Lake[species$Lake=="Lake Barun-Torey"]<-"Lake Barun-Torey (Torey lakes)" species$Lake[species$Lake=="Larnaca Salt Lake"]<-"Larnaca Lake" species$Lake[species$Lake=="Mar Chiquita Lake"]<-"Lake Mar Chiquita" species$Lake[species$Lake=="Bain-Tsagan lake"]<-"Lake Bain-Tsagan" species$Lake[species$Lake=="Alligator reservoir"]<-"Queimadas" # get rid of arctic lakes species<- species[-which(is.element(species$Lake,eliminate)),] # check whether we have any mismatches: lakes<-unique(species$Lake) lakes[(is.na(match(lakes, sal.work$lake)))] sal.work$lake[which(is.na(match(sal.work$lake,lakes)))] # determine the number of endangered species per lake end<-c() for (i in lakes) {x<-length(which((species$Endangered.species[species$Lake==i])!="")) end<-c(end,x)} end<-data.frame(lakes, end) # add the information to the sal.work data frame end<-end[match(sal.work$lake,end$lakes),] sal.work$end_no<-end$end # explore descirptive statistics mean(sal.work$end_no) sd(sal.work$end_no) median((sal.work$end_no)) hist(sal.work$end_no) ### start model selection process # add water_type variable sal.work$water_type<- "soda"; sal.work$water_type[sal.work$continentalsaline==1]<-"saline" sal.work$water_type[sal.work$seawater==1]<-"seawater" # first linearisation: different models for transformed predictors were compared # transformations that improved the AIC of the full model, are listed below sal.work.log<-sal.work sal.work.sqr<-sal.work sal.work.log$salinity_var<-log(sal.work$salinity_var) sal.work.sqr$salinity_var<-(sal.work$salinity_var)^2 reference.model<-glm(data = sal.work, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = poisson(link="log")) trans.log<-glm(data = sal.work.log, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = poisson(link="log")) trans.sqr<-glm(data = sal.work.sqr, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = poisson(link="log")) AIC(reference.model, trans.log, trans.sqr) summary(reference.model) # required transformations: sal.work.mod<-sal.work sal.work.mod$salinity<-log(sal.work$salinity) sal.work.mod$remoteness<-log(sal.work$remoteness) sal.work.mod$surface<-(sal.work$surface)^2 sal.work.mod$volume<-(sal.work$volume)^2 # standardise to compare predictor effect sizes: predictors.std<-which(is.element(colnames(sal.work.mod), c("remoteness", "surface", "volume", "salinity", "salinity_var"))) for(i in predictors.std){sal.work.mod[,i]<-scale(sal.work.mod[,i]) } # start model selection process library(glmmTMB) ###### A) first trial - normal regression stand<-lm(data = sal.work.mod, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected) plot(stand) # that is terrible!! ###### B) second trial glm model - with poission model, we have over-dispersion and a problem with 0 inflation. This sloves the problem working.hypoth<-glmmTMB(data = sal.work.mod, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = nbinom2(link="log"), zi = ~0) working.hypoth1<-glmmTMB(data = sal.work.mod, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = nbinom2(link="log"), zi = ~1) working.hypoth2<-glmmTMB(data = sal.work.mod, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, zi = ~1) AIC(working.hypoth , working.hypoth1,working.hypoth2) library(DHARMa) # check residuals simulateResiduals(working.hypoth, plot = T) # okay - that seems to have solved the problems - check a bit further: summary(working.hypoth) # wow, there are a lot of things not significant anymore... shapiro.test(residuals(working.hypoth1)) hist(residuals(working.hypoth1), breaks = 20) plot(residuals(working.hypoth1)~ predict(working.hypoth1)) plot(residuals(working.hypoth)~ predict(working.hypoth)) # that is much better, but there seems still the problem with the increasing residual variance with the fitted values... ##### C) Try gls regressions library(lme4); library(nlme) # next step: check for the reason why the gls does not work for the full model... working.hypoth3<-gls(end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + protected + salinity_var, data = sal.work.mod[is.na(sal.work.mod$salinity_var)==F,], weights=varExp(form=~ as.numeric(salinity_var)), method = "REML") plot(working.hypoth3) # we have two problems here - first we have all kind of convergence issues and second the residuals do not look good -> let's stay with # the glm approach, seems more robust! # that is a nice way to test for overdispersion: library(AER) w<-glm(data = sal.work.mod, end_no ~ water_type + remoteness + surface + volume + surface:volume + salinity + salinity_var + protected, family = poisson(link="log")) dispersiontest(w,trafo=1) ##### D) Model selection # search other simpler models mod1<-glmmTMB(data = sal.work.mod, end_no ~ remoteness + salinity + surface + volume + surface:volume + protected, family = nbinom2(link="log"), zi = ~0) mod2<-glmmTMB(data = sal.work.mod, end_no ~ remoteness + salinity +water_type + surface + volume + surface:volume , family = nbinom2(link="log"), zi = ~0) mod3<-glmmTMB(data = sal.work.mod, end_no ~ salinity + volume + surface:volume , family = nbinom2(link="log"), zi = ~0) mod4<-glmmTMB(data = sal.work.mod, end_no ~ salinity + volume , family = nbinom2(link="log"), zi = ~0) mod5<-glmmTMB(data = sal.work.mod, end_no ~ salinity , family = nbinom2(link="log"), zi = ~0) mod6<-glmmTMB(data = sal.work.mod, end_no ~ salinity + remoteness, family = nbinom2(link="log"), zi = ~0) mod7<-glmmTMB(data = sal.work.mod, end_no ~ salinity + remoteness+ volume, family = nbinom2(link="log"), zi = ~0) mod8<-glmmTMB(data = sal.work.mod, end_no ~ salinity + remoteness+ volume + surface:volume, family = nbinom2(link="log"), zi = ~0) mod9<-glmmTMB(data = sal.work.mod, end_no ~ salinity + remoteness+ volume + surface:volume + surface, family = nbinom2(link="log"), zi = ~0) AIC(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8,mod9) BIC(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8,mod9) # get R2 library("insight"); library("performance") performance::r2(mod7) performance::r2(mod5) summary(mod7) # according to AIC model selection, the best model. But there are several models with similar good fit. # The best AIC model has a clearly higher R2 than the best BIC model. Based on the sample size, we could justify either # of the two model selection approaches. But the higher R2 points into the direction that the BIC maybe still penalises # quite heavily. Hence, we go for an AIC-based model selection. # get change along the range of predictor(want to have that for salinity): range.std<-function(x){(x-min(x, na.rm = T))/(max(x, na.rm = T)-min(x, na.rm = T))} mod<- glmmTMB(data = sal.work.mod, end_no ~ range.std(salinity) + range.std(remoteness)+ range.std(volume), family = nbinom2(link="log"), zi = ~0) summary(mod) #2.7 species less across the range summary(sal.work.mod$end_no) a<-summary(mod7) b<-a$coefficients$cond # plot results plotdat<- data.frame(coefficients=b[,1], confint(mod7)[-5,]) colnames(plotdat)[2:3]<-c("lwr", "upr") plotdat$predictors<-rownames(plotdat) plotdat$predictors<-factor(plotdat$predictors, levels = rev(c("(Intercept)","water_typeseawater", "water_typesoda", "protected", "salinity","salinity_var", "remoteness","surface",'volume',"surface:volume"))) colour_code3.1<- c( rgb(200,217,251,maxColorValue=255),rgb(249,228,119,maxColorValue=255),rgb(246,202,225,maxColorValue=255)) #450*250 library(ggplot2) ggplot(data = plotdat[-1,], aes(x = coefficients, y = predictors, fill = predictors)) + geom_errorbarh(aes(xmin = lwr, xmax = upr), colour = colour_code3.1, height = 0, size = 1.4) + geom_vline(xintercept = 0, colour = 'darkgrey', linetype= 'dashed') + geom_point(size=3.4, shape =21, colour = "black") + theme_bw()+ scale_fill_manual(values = colour_code3.1[c(3,2,1)]) ###### 4) Salinity as predictor of individual ecosystem services ###### # load in data and check patterns services<-services[match(sal.work$lake,services$Lake),] table(services$Ramsar, services$Nationally_protected) # three were Ramsar but not nationally protected services$protected<-0; services$protected[which(services$Ramsar==1 | services$Nationally_protected==1)]<-1 e_services<-colnames(services[,c(4:7,9:14,17)]) services$salinity<-sal.work$salinity sal<-c(); serv<-c() for(i in e_services){col<-which(colnames(services)==i) sal<-c(sal,services$salinity[services[,i]==1]) serv<-c(serv, rep(i, length(services$salinity[services[,i]==1])))} plotdat<-data.frame(sal, serv) plotdat$serv<-factor(plotdat$serv, levels = c('salts_extraction', 'Other_cultural_services', 'Recreation', 'Science','Catchment_Water', 'Lake_water','waste_water_discharge','protected' ,'End_species','biomass_harvest', 'Fishery')) x<-aggregate(plotdat$sal, by=list(plotdat$serv), function(x) mean(x)) rm(x, sal, serv,e_services) library(ggplot2); library(ggridges);library(viridis); library(hrbrthemes) rbPal <- colorRampPalette(col); n <- 12 col <- rbPal(n) ggplot(data = plotdat, aes(x= sal, y = serv, fill = ..x.. )) + theme_bw() + geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) + xlim(0,500) + scale_fill_gradient(limits = c(0, 500), low = rgb(130,201,246,maxColorValue =255), high =rgb(32,61,120,maxColorValue =255)) colour_code10<- c( rgb(0,118,174,maxColorValue=255),rgb(255,116,0,maxColorValue=255),rgb(0,161,59,maxColorValue=255), rgb(239,0,0,maxColorValue=255),rgb(157,99,181,maxColorValue=255),rgb(152,82,70,maxColorValue=255), rgb(246,110,185,maxColorValue=255),rgb(127,124,119,maxColorValue=255),rgb(194,190,44,maxColorValue=255), rgb(0,194,205,maxColorValue=255)) colour_code3<- c( rgb(233,152,51,maxColorValue=255),rgb(119,145,89,maxColorValue=255),rgb(72,108,140,maxColorValue=255)) # 400*700 plotdat$num<-as.numeric(plotdat$serv) ggplot(data = plotdat, aes(x= sal, fill =serv)) + theme_bw() + geom_boxplot(color="black", alpha=0.80, width = 0.2,position=position_dodge(0.35))+ scale_fill_manual(values = c(colour_code3[rev(c(3,3,2,2,2,3,3,1,1,1,3))])) ### second part - run the logistic regressions: # we always consider three potential forms fo transformations: services$salinity.log<-log(sal.work$salinity+1) services$salinity.sqr<-(sal.work$salinity)^2 services$salinity.uni<-poly(sal.work$salinity,2)[,2] plot(sal.work$salinity, services$salinity.log) plot(sal.work$salinity, services$salinity.uni) colnames(services[,c(4:7,9:15)]) # not significant: Catchment_Water, waste_water_discharge, biomass_harvest, Recreation, Science, Other_cultural_services mod.1<-glm(data = services, protected ~ salinity, family = "binomial") mod.2<-glm(data = services, protected ~ salinity.log, family = "binomial") mod.3<-glm(data = services, protected ~ salinity.sqr, family = "binomial") mod.4<-glm(data = services, protected ~ salinity.uni, family = "binomial") AIC(mod.1,mod.2,mod.3,mod.4) summary(mod.1) mod.1<-glm(data = services, Lake_water ~ salinity, family = "binomial") mod.2<-glm(data = services, Lake_water ~ salinity.log, family = "binomial") mod.3<-glm(data = services, Lake_water ~ salinity.sqr, family = "binomial") mod.4<-glm(data = services, Lake_water ~ salinity.uni, family = "binomial") AIC(mod.1,mod.2,mod.3,mod.4) summary(mod.4) mod.1<-glm(data = services, Fishery ~ salinity, family = "binomial") mod.2<-glm(data = services, Fishery ~ salinity.log, family = "binomial") mod.3<-glm(data = services, Fishery ~ salinity.sqr, family = "binomial") mod.4<-glm(data = services, Fishery ~ salinity.uni, family = "binomial") AIC(mod.1,mod.2,mod.3,mod.4) summary(mod.2) mod.1<-glm(data = services, End_species ~ salinity, family = "binomial") mod.2<-glm(data = services, End_species ~ salinity.log, family = "binomial") mod.3<-glm(data = services, End_species ~ salinity.sqr, family = "binomial") mod.4<-glm(data = services, End_species ~ salinity.uni, family = "binomial") AIC(mod.1,mod.2,mod.3,mod.4) summary(mod.2) # After running the regressions for all ecosystem services, these are the significant models mod.lw<-glm(data = services, Lake_water ~ salinity.uni, family = "binomial") mod.fish<-glm(data = services, Fishery ~ salinity.log, family = "binomial") mod.extr<-glm(data = services, salts_extraction ~ salinity.log, family = "binomial") #mod.protected<-glm(data = services, protected ~ salinity, family = "binomial") mod.end<-glm(data = services, End_species ~ salinity.log, family = "binomial") summary(mod.end) # plotting the results: # create prediction intervals preds<-predict(mod.end, interval = "confidence" , type = "link", se.fit = TRUE) critval <- 1.96 ## approx 95% CI services$upr<- exp(preds$fit+critval*preds$se.fit)/ (1+exp(preds$fit+critval*preds$se.fit)) services$lwr<- exp(preds$fit-critval*preds$se.fit)/ (1+exp(preds$fit-critval*preds$se.fit)) services$fit_final<- exp(preds$fit)/(1+exp(preds$fit)) #320 * 230 or 320 * 190 library(ggplot2) ggplot(data = services, aes(x =salinity, y = mod.end ))+ geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "darkred", alpha = 0.3)+ geom_line(aes(y=fit_final), colour = "darkred", size = 0.8) + geom_point(alpha = 0.4) + theme_bw() preds<-predict(mod.fish, interval = "confidence" , type = "link", se.fit = TRUE) critval <- 1.96 ## approx 95% CI services$upr<- exp(preds$fit+critval*preds$se.fit)/ (1+exp(preds$fit+critval*preds$se.fit)) services$lwr<- exp(preds$fit-critval*preds$se.fit)/ (1+exp(preds$fit-critval*preds$se.fit)) services$fit_final<- exp(preds$fit)/(1+exp(preds$fit)) ggplot(data = services, aes(x =salinity, y = Fishery ))+ geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "darkred", alpha = 0.3)+ geom_line(aes(y=fit_final), colour = "darkred", size = 0.8) + geom_point(alpha = 0.4) + theme_bw() preds<-predict(mod.extr, interval = "confidence" , type = "link", se.fit = TRUE) critval <- 1.96 ## approx 95% CI services$upr<- exp(preds$fit+critval*preds$se.fit)/ (1+exp(preds$fit+critval*preds$se.fit)) services$lwr<- exp(preds$fit-critval*preds$se.fit)/ (1+exp(preds$fit-critval*preds$se.fit)) services$fit_final<- exp(preds$fit)/(1+exp(preds$fit)) ggplot(data = services, aes(x =salinity, y = salts_extraction ))+ geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "darkred", alpha = 0.3)+ geom_line(aes(y=fit_final), colour = "darkred", size = 0.8) + geom_point(alpha = 0.4) + theme_bw() preds<-predict(mod.lw, interval = "confidence" , type = "link", se.fit = TRUE) critval <- 1.96 ## approx 95% CI services$upr<- exp(preds$fit+critval*preds$se.fit)/ (1+exp(preds$fit+critval*preds$se.fit)) services$lwr<- exp(preds$fit-critval*preds$se.fit)/ (1+exp(preds$fit-critval*preds$se.fit)) services$fit_final<- exp(preds$fit)/(1+exp(preds$fit)) library(ggplot2) ggplot(data = services, aes(x =salinity, y = Lake_water ))+ geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "darkred", alpha = 0.3)+ geom_line(aes(y=fit_final), colour = "darkred", size = 0.8) + geom_point(alpha = 0.4) + theme_bw() preds<-predict(mod.end, interval = "confidence" , type = "link", se.fit = TRUE) critval <- 1.96 ## approx 95% CI services$upr<- exp(preds$fit+critval*preds$se.fit)/ (1+exp(preds$fit+critval*preds$se.fit)) services$lwr<- exp(preds$fit-critval*preds$se.fit)/ (1+exp(preds$fit-critval*preds$se.fit)) services$fit_final<- exp(preds$fit)/(1+exp(preds$fit)) library(ggplot2) ggplot(data = services, aes(x =salinity, y = End_species ))+ geom_ribbon(aes(ymin = lwr, ymax = upr), fill = "darkred", alpha = 0.3)+ geom_line(aes(y=fit_final), colour = "darkred", size = 0.8) + geom_point(alpha = 0.4) + theme_bw()