# -*- coding: utf-8 -*-
"""
Created on Mon Mar 10 20:56:07 2025

@author: drh78
"""

#import Libraries here
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations,product
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
import numpy as np
from numpy.random import seed


#read in dataframe
df = pd.read_csv("/Users/jafrankhan/Desktop/Document/My MSA/QMB 6358/mid_term_dataset.csv", delimiter= ",")
#create the beds-to-baths ratio (B_to_B) for analysis
df['B_to_B'] = df.beds/df.baths
#create per_sheltered (home_size as a share of parcel_size) for analysis
df['per_sheltered'] = df.home_size/df.parcel_size
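
#quick sanity check (a sketch, assuming listings should never have zero baths or a zero parcel size,
#which would make the ratios above infinite); flag any rows where that happens
bad_ratio_rows = df[(df.baths == 0) | (df.parcel_size == 0)]
if len(bad_ratio_rows) > 0:
    print("Warning:", len(bad_ratio_rows), "rows have zero baths or zero parcel_size")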
#scale price down by a factor of 1,000 so the numbers are easier to read
df['price'] = df['price']/1000
#list of the column names used for the exploratory graphs
df_header_graphs = ['price','year','age','beds','baths','home_size','parcel_size','pool', 'dist_cbd','dist_lakes','x_coord','y_coord','B_to_B','per_sheltered']
#same list without price, used later when building interaction terms
df_header = ['year','age','beds','baths','home_size','parcel_size','pool', 'dist_cbd','dist_lakes','x_coord','y_coord','B_to_B','per_sheltered']



#Explore the data to understand it better
#basic description
data_summary = df.describe()
#correlation matrix
correlation_matrix_basic = df.corr()
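
#a quick way to eyeball the correlation matrix as a heatmap (a sketch using plain matplotlib; no extra libraries assumed)
plt.imshow(correlation_matrix_basic, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')
plt.xticks(range(len(correlation_matrix_basic.columns)), correlation_matrix_basic.columns, rotation=90)
plt.yticks(range(len(correlation_matrix_basic.columns)), correlation_matrix_basic.columns)
plt.title("Correlation matrix heatmap")
plt.show()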
#define a plotting helper that takes one column of data and its column name
def Graphs(data, name):

    #histogram of the variable
    plt.hist(data, bins = 100)
    plt.xlabel(name.capitalize())
    plt.ylabel('Occurrences')
    plt.title(name.capitalize() + " histogram")
    plt.show()

    #scatterplot of the variable against price (pink markers)
    plt.scatter(data, df["price"], color = '#E75480')
    plt.xlabel(name.capitalize())
    plt.ylabel('Price (divided by 1,000)')
    plt.title(name.capitalize() + " Scatterplot")
    plt.show()
    return

#call the function for every column in the graphing list
for col in df_header_graphs:
    Graphs(df[col], col)
    
    
#set the random seed so results are reproducible
seed(1234)

#items needed for every attempt
mse = {}
x_combos = []


'''First attempt at modeling'''
#build every combination of the candidate predictors, taking 1 to 13 variables at a time
for n in range(1,14):
    combos = combinations(df_header, n)
    x_combos.extend(combos)
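
#sanity check: with 13 candidate variables this should be 2**13 - 1 = 8191 combinations,
#each of which gets a degree-3 polynomial expansion and 10-fold CV below, so the loop is slow
print("Number of variable combinations to test:", len(x_combos))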
    
#define the dependent variable
y = df['price']
    
for combo in x_combos:
    combo_list = list(combo)
    x = df[combo_list]
    poly = PolynomialFeatures(3) # degree 3 turned out to be the sweet spot for many combinations
    poly_x = poly.fit_transform(x)
    model = LinearRegression()
    cv_scores = cross_validate(model, poly_x, y, cv=10, scoring='neg_mean_squared_error')
    mse[str(combo_list)] = np.mean(cv_scores['test_score'])

print("Outcomes from the Best Linear Regression Model:")
min_mse = abs(max(mse.values()))
print("Minimum Average Test MSE:", min_mse.round(2))
for possibles, i in mse.items():
    if i == -min_mse:
        print("The Combination of Variables:", possibles)

#first attempt results
#best combination: 'year', 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered'; MSE of 3167.774511543452 when tested with the whole set at degree 3

    
    
'''Second attempt at modeling'''
#For the second attempt we wanted interaction variables involved, so we built a correlation matrix
#of every possible pairwise interaction term against price in order to pick out a few to test
interaction_terms = {}

#loop through each variable
for i in range(len(df_header)):
    #pair it with every variable that comes after it
    for j in range(i+1, len(df_header)):  # avoids self-multiplication and duplicate pairs
        interaction_name = f"{df_header[i]}_x_{df_header[j]}" #name for the interaction column
        interaction_terms[interaction_name] = df[df_header[i]] * df[df_header[j]]

#turn the dictionary of interaction columns into a dataframe
interaction_df = pd.DataFrame(interaction_terms)
#add price as that is our dependent variable
interaction_df['price'] = df['price'].copy()
#Time for the correlation matrix
correlation_matrix_interactions = interaction_df.corr()
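
#to pick interaction terms it helps to rank them by the absolute value of their correlation with price
#(a sketch of how the "strongest correlation to price" picks below were presumably made)
interaction_rank = correlation_matrix_interactions['price'].drop('price').abs().sort_values(ascending=False)
print("Interaction terms most correlated with price:")
print(interaction_rank.head(10))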

#Interaction variables alone won't give a good estimate of price, so join the interaction columns back onto the original dataframe
#(joined on the row index rather than merged on 'price', since duplicate prices would multiply rows in an inner merge)
merged_df = interaction_df.drop(columns='price').join(df)

#combinations for the second attempt: the best combo from attempt 1 plus the interaction terms with the
#strongest correlation to price (only two interaction terms are listed in the code; more could have been
#tested, but my computer wasn't strong enough)
x_combos = []
for n in range(1,12):
    combos = combinations(['year', 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered','year_x_home_size', 'parcel_size_x_per_sheltered'], n)
    x_combos.extend(combos)

y = merged_df['price']
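
#the CV loop that actually scored this attempt isn't in the script as saved; presumably it mirrored the
#degree-3 polynomial, 10-fold CV loop from attempt 1, just run over merged_df. A sketch under that assumption:
mse = {}
for combo in x_combos:
    combo_list = list(combo)
    x = merged_df[combo_list]
    poly = PolynomialFeatures(3)
    poly_x = poly.fit_transform(x)
    model = LinearRegression()
    cv_scores = cross_validate(model, poly_x, y, cv=10, scoring='neg_mean_squared_error')
    mse[str(combo_list)] = np.mean(cv_scores['test_score'])

print("Outcomes from the Best Linear Regression Model (attempt 2):")
min_mse = abs(max(mse.values()))
print("Minimum Average Test MSE:", min_mse.round(2))
for possibles, score in mse.items():
    if score == -min_mse:
        print("The Combination of Variables:", possibles)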
#Second attempt results
#New best model with some interactions: 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered', 'year_x_home_size', 'parcel_size_x_per_sheltered'; MSE = 2228.563 at degree 3




'''Third attempt that's all about location!'''
#reset the combo list and the MSE dictionary so this attempt is scored on its own
x_combos = []
mse = {}
for n in range(1,11):
    combos = combinations(['x_coord','y_coord','dist_cbd','dist_lakes','x_coord_x_y_coord','dist_lakes_x_x_coord','dist_cbd_x_x_coord', 'dist_lakes_x_y_coord','dist_cbd_x_y_coord','dist_cbd_x_dist_lakes'], n)
    x_combos.extend(combos)
    
#define the dependent variable
y = merged_df['price']
    
for combo in x_combos:
    combo_list = list(combo)
    x = merged_df[combo_list]
    poly = PolynomialFeatures(3) # degree 3 again proved to be the sweet spot
    poly_x = poly.fit_transform(x)
    model = LinearRegression()
    cv_scores = cross_validate(model, poly_x, y, cv=10, scoring='neg_mean_squared_error')
    mse[str(combo_list)] = np.mean(cv_scores['test_score'])

print("Outcomes from the Best Linear Regression Model:")
min_mse = abs(max(mse.values()))
print("Minimum Average Test MSE:", min_mse.round(2))
for possibles, i in mse.items():
    if i == -min_mse:
        print("The Combination of Variables:", possibles)
        
#frankly this model was bad: the best combo was 'x_coord', 'y_coord', 'dist_cbd', 'dist_lakes' with an MSE of 11350.03, but it was fun to test out

'''Version 4'''
# I loved this idea; the issue was that it took too long to run on my computer, though it seemed to work
x_combos = []
#tried to simplify the variables tested
for n in range(1,10):
    combos = combinations(['year', 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered'], n)
    x_combos.extend(combos)

y = df['price']

# Candidate exponent values: 6 points between 0.1 and 3, built with linspace
a = np.linspace(0.1, 3, 6)

# Store MSE Results
mse = {}

#Function that tests exponent transformations of each variable; end_idx and start_idx control
#which batch of exponent combinations gets evaluated (the full search is too big to run in one go)
def expon(end_idx, start_idx):
    #for each of the combos
    for combo in x_combos:
        #every possible assignment of an exponent to each variable in the combo (len(a)**len(combo) of them)
        exponent_combos = list(product(*[a] * len(combo)))
        
        #evaluate only the exponent combinations in this batch [start_idx, end_idx)
        for i in range(start_idx, min(end_idx, len(exponent_combos))):
            #the exponents assigned to each variable on this pass
            exponent_values = exponent_combos[i]
            # Create a copy of df to avoid modifying the original just to be safe
            df_temp = df.copy()
            # dependent variable
            y = df_temp['price']
            # Apply exponent transformations correctly
            transformed_variable= []
            #For each variable in the combo, raise that variable to the given exponent and create a new column for it in df_temp.
            for j, variable in enumerate(combo):  # Iterate over features in combo
                #rename it
                transformed_variable_name = f"{variable}_exp"
                #raise the variable to the exponent
                df_temp[transformed_variable_name] = df_temp[variable] ** exponent_values[j]
                #add it to the transformed variable items
                transformed_variable.append(transformed_variable_name)
            # use both the original variables and their exponent-transformed versions
            x = df_temp[list(combo) + transformed_variable]  
            # Train & evaluate the model
            model = LinearRegression()
            cv_scores = cross_validate(model, x, y, cv=10, scoring=('neg_mean_squared_error'))
            # Store MSE results
            mse[(tuple(combo), tuple(exponent_values))] = np.mean(cv_scores['test_score'])
    
    # Find the best model scored so far (mse accumulates across every batch run)
    best_features, best_exponents = max(mse, key=mse.get)
    best_mse = mse[(best_features, best_exponents)]
    
    # Print Results
    print("\n=== Best Model Found in Batch ===")
    print("Feature Combination:", best_features)
    print("Best Exponents:", best_exponents)
    print("Best MSE:", -best_mse)

expon(5, 0)
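#if the batches prove workable on a stronger machine, the remaining exponent combinations could be worked
#through in chunks like this (left commented out because each batch is very slow; batch size 5 matches the call above)
#for start in range(5, 30, 5):
#    expon(start + 5, start)  # end_idx comes first, then start_idx, matching the signature above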
#I fully think this could produce some interesting models, but with the limited computing power of my laptop I'm not sure it even finishes, which brought us to

'''Model 5'''
#As a note, this model was not very helpful, as it also didn't run well.
#The goal was to rule out the idea that useful variables or transformations were being left out,
#so every variable was included and given its own range of candidate exponents.
a = np.linspace(0.1, 5, 10)
b = np.linspace(0.1, 5, 10)
c = np.linspace(0.1, 5, 10)
d = np.linspace(0.1, 5, 10)
e = np.linspace(0.1, 5, 10)
f = np.linspace(0.1, 5, 10)
g = np.linspace(0.1, 5, 10)
h = np.linspace(0.1, 5, 10)
i = np.linspace(0.1, 5, 10)
j = np.linspace(0.1, 5, 10)   
    
    
    
#each variable got its own for loop (ten in total), but my machine started crashing once three or four of the loops were active
mse = {}  # reset the results so only this model's scores are compared below
for k in range(10):
    for l in range(10):
        for m in range(10):
            for n in range(10):
                for p in range(10):
                    for q in range(10):
                        for r in range(10):
                            for s in range(10):
                                for t in range(10):
                                    for u in range(10):
                                        y = df.price
                                        #raise each variable to its own candidate exponent
                                        df['year_exp'] = df.year ** a[k]
                                        df['age_exp'] = df.age ** b[l]
                                        df['home_size_exp'] = df.home_size ** c[m]
                                        df['B_to_B_exp'] = df.B_to_B ** j[u]
                                        df['per_sheltered_exp'] = df.per_sheltered ** d[n]
                                        df['pool_exp'] = df.pool ** e[p]
                                        df['dist_lakes_exp'] = df.dist_lakes ** f[q]
                                        df['dist_cbd_exp'] = df.dist_cbd ** g[r]
                                        df['x_coord_exp'] = df.x_coord ** h[s]
                                        df['y_coord_exp'] = df.y_coord ** i[t]

                                        #original variables plus their exponent-transformed versions
                                        x = df[['year', 'year_exp', 'age', 'age_exp', 'home_size', 'home_size_exp', 'B_to_B', 'B_to_B_exp', 'per_sheltered', 'per_sheltered_exp', 'pool', 'pool_exp', 'dist_lakes', 'dist_lakes_exp', 'dist_cbd', 'dist_cbd_exp', 'x_coord', 'x_coord_exp', 'y_coord', 'y_coord_exp']]
                                        model = LinearRegression()
                                        cv_scores = cross_validate(model, x, y, cv=10, scoring='neg_mean_squared_error')
                                        mse[str(a[k]), str(b[l]), str(c[m]), str(d[n]), str(e[p]), str(f[q]), str(g[r]), str(h[s]), str(i[t]), str(j[u])] = np.mean(cv_scores['test_score'])


print("Outcomes from the Best Linear Regression Model:")
min_mse = abs(max(mse.values()))
print("Minimum Average Test MSE:", min_mse.round(3))
for exponents, score in mse.items():
    if score == -min_mse:
        print("The Associated Exponent Values:", exponents)
        
#overall it could have led somewhere but ultimately didn't


'''Code to run for presentation'''
#this left us with two main formulas to test that would likely be efficient

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
import numpy as np

#read in the original dataframe and rebuild the engineered variables
df = pd.read_csv("/Users/jafrankhan/Desktop/Document/My MSA/QMB 6358/mid_term_dataset.csv", delimiter= ",")
df['B_to_B'] = df.beds/df.baths
df['per_sheltered'] = df.home_size/df.parcel_size
df['year_x_home_size'] = df.year*df.home_size
df['parcel_size_x_per_sheltered'] = df.parcel_size *df.per_sheltered
df['price'] = df['price']/1000


#best combination from the polynomial-only search (attempt 1); comment this out when testing the interaction model below
x = df[['year', 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered']]
#best combination from the interaction search (attempt 2); this overwrites the line above, so only one should be active at a time
x = df[['age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered', 'year_x_home_size', 'parcel_size_x_per_sheltered']]
#expand the chosen variables with degree-3 polynomial features
poly = PolynomialFeatures(3)

# Note that PolynomialFeatures returns a plain array with no variable names attached.  While names are not
# necessary to estimate the model, they may be useful to see in the summary, so attach them when building
# the dataframe.

x = pd.DataFrame(poly.fit_transform(x), columns=poly.get_feature_names_out(x.columns))
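
#quick peek at a few of the generated feature names, just to confirm what the expansion produced
print("First few polynomial features:", list(x.columns[:8]))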
y = df['price']

#PolynomialFeatures already includes a constant ("1") column, so sm.add_constant is not needed here
best_model = sm.OLS(y, x)
results = best_model.fit()
print(results.summary())

residuals = results.resid

# Compute the in-sample (training) Mean Squared Error (MSE)
mse = np.mean(residuals**2)

print("Mean Squared Error (MSE):", mse)


#for validation
val_set = pd.read_csv("/Users/jafrankhan/Desktop/Document/My MSA/QMB 6358/mid_term_validation_set-7.csv", delimiter= ",")
val_set['B_to_B'] = val_set.beds/val_set.baths
val_set['per_sheltered'] = val_set.home_size/val_set.parcel_size
val_set['year_x_home_size'] = val_set.year*val_set.home_size
val_set['parcel_size_x_per_sheltered'] = val_set.parcel_size *val_set.per_sheltered
val_set['price'] = val_set['price']/1000

#validation features for the polynomial-only model; comment out whichever block does not match the x chosen above
x_val = val_set[['year', 'age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered']]
#validation features for the interaction model; this line overwrites the one above, so keep only one active
x_val = val_set[['age', 'home_size', 'dist_cbd', 'dist_lakes', 'x_coord', 'y_coord', 'B_to_B', 'per_sheltered', 'year_x_home_size', 'parcel_size_x_per_sheltered']]


#reuse the already-fitted poly object so the validation features get exactly the same degree-3 expansion
poly_x_val = poly.transform(x_val)

val_predictions = results.predict(poly_x_val)

#out-of-sample MSE on the validation set
final_mse = sum((val_set.price-val_predictions)**2)/len(val_set)

print("Validation MSE:", round(final_mse,2))
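
#optional sanity check (a sketch, not one of the original attempts): compare against a naive baseline
#that always predicts the training-set mean price; the chosen model should beat this comfortably
baseline_mse = np.mean((val_set.price - df.price.mean())**2)
print("Naive mean-prediction MSE:", round(baseline_mse, 2))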







