# Code to develop an MLR model for the East River water table sensitivity to
# warming. Written by R. Carroll, September 2023.
#
# For every combination of 1-4 candidate predictors the script:
#   1. screens the subset for multicollinearity (max VIF <= VIF_THRESHOLD),
#   2. scores it with leave-one-out cross-validation (mean squared error),
#   3. logs AIC / max VIF / LOOCV error / R^2 / p-value to iteration_results.csv,
#   4. keeps the model with the lowest AIC (fit on the full dataset) and
#      writes its summary to best_AIC_stats.txt.
import csv
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut

# Maximum number of predictors per candidate model.
MAX_PREDICTORS = 4
# Models whose largest variance inflation factor exceeds this are rejected
# as too collinear.
VIF_THRESHOLD = 5.0


def _max_vif(subset_X):
    """Return the largest variance inflation factor among subset_X's columns.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing column i on
    the remaining columns (with intercept). A single-column subset has no
    collinearity by construction, so its VIF is 1 — this lets one-variable
    models pass the screen instead of being silently skipped.
    """
    if subset_X.shape[1] < 2:
        return 1.0
    vifs = []
    for i in range(subset_X.shape[1]):
        others = sm.add_constant(subset_X.drop(subset_X.columns[i], axis=1))
        r_sq = sm.OLS(subset_X.iloc[:, i], others).fit().rsquared
        # A perfect fit (R^2 == 1) means infinite inflation; avoid the
        # divide-by-zero warning explicitly.
        vifs.append(1.0 / (1.0 - r_sq) if r_sq < 1.0 else np.inf)
    return max(vifs)


def _loocv_mse(subset_X, y):
    """Mean leave-one-out squared prediction error of OLS(y ~ subset_X + 1)."""
    loo = LeaveOneOut()
    errors = []
    for train_idx, test_idx in loo.split(subset_X):
        X_train = sm.add_constant(subset_X.iloc[train_idx])
        # has_constant='add' is required: add_constant alone will not add an
        # intercept column to the single held-out row.
        X_test = sm.add_constant(subset_X.iloc[test_idx], has_constant='add')
        est = sm.OLS(y.iloc[train_idx], X_train).fit()
        y_pred = est.predict(X_test)
        errors.append(mean_squared_error(y.iloc[test_idx], y_pred))
    return float(np.mean(errors))


def main():
    """Run the exhaustive MLR model search and write the result files."""
    df = pd.read_csv('traits.csv')

    # Candidate independent variables (terrain, climate, and vegetation traits).
    all_independent_vars = ['area', 'elev', 'aspect', 'slope', 'east1', 'north1',
                            'east2', 'north2', 'TWI', 'relief', 'f3200', 'fLSA',
                            'np', 'p', 'ts', 'fp', 'totaldd', 'pdd', 'fb', 'fg',
                            'fgb', 'fshrub', 'fhardwood', 'fconifer', 'ff',
                            'trncf', 'trncfall', 'trncff1', 'PPT', 'Srsol',
                            'TMN', 'TMX', 'TAVG', 'VPDMN', 'VPDMX']

    # Dependent variable: change in water table depth.
    y = df['dWT']

    best_aic = float('inf')
    best_model = None
    best_independent_vars = None

    with open('iteration_results.csv', 'w', newline='') as results_file:
        csv_writer = csv.writer(results_file)
        csv_writer.writerow(["Variables", "n", "AIC", "Max VIF",
                             "Mean LOOCV Error", "R-squared", "P-value"])

        for num_vars in range(1, MAX_PREDICTORS + 1):
            for var_combo in combinations(all_independent_vars, num_vars):
                subset_X = df[list(var_combo)]
                if subset_X.shape[0] < 2:
                    continue  # too few rows to fit and cross-validate

                max_vif = _max_vif(subset_X)
                if max_vif > VIF_THRESHOLD:
                    continue  # reject collinear predictor sets

                mean_loocv_error = _loocv_mse(subset_X, y)

                # AIC, R^2, and the F-test p-value must come from the model
                # fit on the FULL dataset. (The previous version reported the
                # AIC of whichever LOOCV fold happened to be fit last, and
                # averaged R^2/p-value over the fold fits.)
                full_fit = sm.OLS(y, sm.add_constant(subset_X)).fit()

                csv_writer.writerow([", ".join(var_combo), num_vars,
                                     full_fit.aic, max_vif, mean_loocv_error,
                                     full_fit.rsquared, full_fit.f_pvalue])

                if full_fit.aic < best_aic:
                    best_aic = full_fit.aic
                    best_model = full_fit
                    best_independent_vars = var_combo

    print("Best AIC:", best_aic)
    print("Selected Independent Variables:", best_independent_vars)

    # Guard against no combination passing the VIF screen — the original
    # would raise AttributeError on None here.
    if best_model is not None:
        with open('best_AIC_stats.txt', 'w') as text_file:
            text_file.write(str(best_model.summary()))


if __name__ == "__main__":
    main()