# Code to develop an MLR model for the East River water table sensitivity to
# warming. Written by R. Carroll, September 2023.
#
# For every combination of 1-4 candidate predictors the script:
#   1. screens the subset for multicollinearity (max VIF <= VIF_THRESHOLD),
#   2. scores it with leave-one-out cross-validation (mean squared error),
#   3. logs AIC / max VIF / LOOCV error / R^2 / p-value to iteration_results.csv,
#   4. keeps the model with the lowest AIC (fit on the full dataset) and
#      writes its summary to best_AIC_stats.txt.
import csv
from itertools import combinations

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut

# Maximum number of predictors per candidate model.
MAX_PREDICTORS = 4
# Models whose largest variance inflation factor exceeds this are rejected
# as too collinear.
VIF_THRESHOLD = 5.0


def _max_vif(subset_X):
    """Return the largest variance inflation factor among subset_X's columns.

    VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing column i on
    the remaining columns (with intercept). A single-column subset has no
    collinearity by construction, so its VIF is 1 — this lets one-variable
    models pass the screen instead of being silently skipped.
    """
    if subset_X.shape[1] < 2:
        return 1.0
    vifs = []
    for i in range(subset_X.shape[1]):
        others = sm.add_constant(subset_X.drop(subset_X.columns[i], axis=1))
        r_sq = sm.OLS(subset_X.iloc[:, i], others).fit().rsquared
        # A perfect fit (R^2 == 1) means infinite inflation; avoid the
        # divide-by-zero warning explicitly.
        vifs.append(1.0 / (1.0 - r_sq) if r_sq < 1.0 else np.inf)
    return max(vifs)


def _loocv_mse(subset_X, y):
    """Mean leave-one-out squared prediction error of OLS(y ~ subset_X + 1)."""
    loo = LeaveOneOut()
    errors = []
    for train_idx, test_idx in loo.split(subset_X):
        X_train = sm.add_constant(subset_X.iloc[train_idx])
        # has_constant='add' is required: add_constant alone will not add an
        # intercept column to the single held-out row.
        X_test = sm.add_constant(subset_X.iloc[test_idx], has_constant='add')
        est = sm.OLS(y.iloc[train_idx], X_train).fit()
        y_pred = est.predict(X_test)
        errors.append(mean_squared_error(y.iloc[test_idx], y_pred))
    return float(np.mean(errors))


def main():
    """Run the exhaustive MLR model search and write the result files."""
    df = pd.read_csv('traits.csv')

    # Candidate independent variables (terrain, climate, and vegetation traits).
    all_independent_vars = ['area', 'elev', 'aspect', 'slope', 'east1', 'north1',
                            'east2', 'north2', 'TWI', 'relief', 'f3200', 'fLSA',
                            'np', 'p', 'ts', 'fp', 'totaldd', 'pdd', 'fb', 'fg',
                            'fgb', 'fshrub', 'fhardwood', 'fconifer', 'ff',
                            'trncf', 'trncfall', 'trncff1', 'PPT', 'Srsol',
                            'TMN', 'TMX', 'TAVG', 'VPDMN', 'VPDMX']

    # Dependent variable: change in water table depth.
    y = df['dWT']

    best_aic = float('inf')
    best_model = None
    best_independent_vars = None

    with open('iteration_results.csv', 'w', newline='') as results_file:
        csv_writer = csv.writer(results_file)
        csv_writer.writerow(["Variables", "n", "AIC", "Max VIF",
                             "Mean LOOCV Error", "R-squared", "P-value"])

        for num_vars in range(1, MAX_PREDICTORS + 1):
            for var_combo in combinations(all_independent_vars, num_vars):
                subset_X = df[list(var_combo)]
                if subset_X.shape[0] < 2:
                    continue  # too few rows to fit and cross-validate

                max_vif = _max_vif(subset_X)
                if max_vif > VIF_THRESHOLD:
                    continue  # reject collinear predictor sets

                mean_loocv_error = _loocv_mse(subset_X, y)

                # AIC, R^2, and the F-test p-value must come from the model
                # fit on the FULL dataset. (The previous version reported the
                # AIC of whichever LOOCV fold happened to be fit last, and
                # averaged R^2/p-value over the fold fits.)
                full_fit = sm.OLS(y, sm.add_constant(subset_X)).fit()

                csv_writer.writerow([", ".join(var_combo), num_vars,
                                     full_fit.aic, max_vif, mean_loocv_error,
                                     full_fit.rsquared, full_fit.f_pvalue])

                if full_fit.aic < best_aic:
                    best_aic = full_fit.aic
                    best_model = full_fit
                    best_independent_vars = var_combo

    print("Best AIC:", best_aic)
    print("Selected Independent Variables:", best_independent_vars)

    # Guard against no combination passing the VIF screen — the original
    # would raise AttributeError on None here.
    if best_model is not None:
        with open('best_AIC_stats.txt', 'w') as text_file:
            text_file.write(str(best_model.summary()))


if __name__ == "__main__":
    main()