Fertility rates data visualization¶

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Define a grayscale colormap
grayscale = LinearSegmentedColormap.from_list(
    'Grayscale', ['#f0f0f0', '#d0d0d0', '#b0b0b0', '#808080', '#000000']
)

# Load the data
df = pd.read_excel('/content/drive/MyDrive/Tadeo/MiTesis/Articulos/SupplementaryMaterial/Supplement_ExcelData_03092024.xlsx', sheet_name='Merge1')

# Total Fertility Rate (TFR)
df['tasa_fecundidad_global'] = (
    (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) +
    (df['Maternidad 20 a 24 años'] / df['Afiliadas 20 a 24 años']) +
    (df['Maternidad 25 a 29 años'] / df['Afiliadas 25 a 29 años']) +
    (df['Maternidad 30 a 34 años'] / df['Afiliadas 30 a 34 años']) +
    (df['Maternidad 35 a 39 años'] / df['Afiliadas 35 a 39 años']) +
    (df['Maternidad 40 a 44 años'] / df['Afiliadas 40 a 44 años']) +
    (df['Maternidad 45 a 49 años'] / df['Afiliadas 45 a 49 años'])
) * 5

# Infant Fertility Rate (10-14 years)
df['tasa_fertilidad_infantil'] = (df['Maternidad infantil'] / df['Afiliadas 10 a 14 años']) * 1000

# Adolescent Fertility Rate (15-19 years)
df['tasa_fertilidad_adolescente'] = (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) * 1000

# Early Fertility Rate (<19 years)
df['tasa_fertilidad_Precoz'] = (
    (df['Maternidad infantil'] + df['Maternidad 15 a 19 años']) /
    (df['Afiliadas 10 a 14 años'] + df['Afiliadas 15 a 19 años']) * 1000
)

# Function to plot heatmaps
def plot_heatmap(data, title, cbar_label):
    plt.figure(figsize=(14, 10))
    sns.heatmap(
        data,
        annot=True,
        cmap=grayscale,
        fmt='.2f',
        linewidths=0.5,
        linecolor='black',
        cbar_kws={'label': cbar_label}
    )
    plt.title(title, fontsize=16)
    plt.ylabel('Department', fontsize=12)
    plt.xlabel('Year', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

#======================= Infant Fertility ============================================================
# Create heatmap for infant fertility rate
pivot_table_infantil = df.pivot_table(
    values='tasa_fertilidad_infantil',
    index='Departamento',
    columns='Año',
    aggfunc='mean'
)

# Add a total row for average infant fertility rate across the period
total_row_infantil = pd.DataFrame(pivot_table_infantil.mean(axis=0)).T
total_row_infantil.index = ['Total']
pivot_table_infantil = pd.concat([total_row_infantil, pivot_table_infantil])

# Sort by the average infant fertility rate in the period, highest to lowest
sorted_departments_infantil = pivot_table_infantil.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_infantil = pivot_table_infantil.loc[['Total'] + list(sorted_departments_infantil)]

# Draw heatmap for infant fertility rate
plot_heatmap(pivot_table_infantil, 'Infant Fertility Rate', '1,000 women (10-14 years)')

#======================= Adolescent Fertility =======================================================
# Create heatmap for adolescent fertility rate
pivot_table_adolescente = df.pivot_table(
    values='tasa_fertilidad_adolescente',
    index='Departamento',
    columns='Año',
    aggfunc='mean'
)

# Add a total row for average adolescent fertility rate across the period
total_row_adolescente = pd.DataFrame(pivot_table_adolescente.mean(axis=0)).T
total_row_adolescente.index = ['Total']
pivot_table_adolescente = pd.concat([total_row_adolescente, pivot_table_adolescente])

# Sort by the average adolescent fertility rate in the period, highest to lowest
sorted_departments_adolescente = pivot_table_adolescente.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_adolescente = pivot_table_adolescente.loc[['Total'] + list(sorted_departments_adolescente)]

# Draw heatmap for adolescent fertility rate
plot_heatmap(pivot_table_adolescente, 'Adolescent Fertility Rate', '1,000 women (15-19 years)')

#======================= Early Fertility =============================================================
# Create heatmap for early fertility rate (infant + adolescent)
pivot_table_precoz = df.pivot_table(
    values='tasa_fertilidad_Precoz',
    index='Departamento',
    columns='Año',
    aggfunc='mean'
)

# Add a total row for average early fertility rate across the period
total_row_precoz = pd.DataFrame(pivot_table_precoz.mean(axis=0)).T
total_row_precoz.index = ['Total']
pivot_table_precoz = pd.concat([total_row_precoz, pivot_table_precoz])

# Sort by the average early fertility rate in the period, highest to lowest
sorted_departments_precoz = pivot_table_precoz.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_precoz = pivot_table_precoz.loc[['Total'] + list(sorted_departments_precoz)]

# Draw heatmap for early fertility rate
plot_heatmap(pivot_table_precoz, 'Early Fertility Rate', '1,000 women (10-19 years)')

#======================= Total Fertility =============================================================
# Create heatmap for total fertility rate (TFR)
pivot_table_general = df.pivot_table(
    values='tasa_fecundidad_global',
    index='Departamento',
    columns='Año',
    aggfunc='mean'
)

# Add a total row for average total fertility rate across the period
total_row_general = pd.DataFrame(pivot_table_general.mean(axis=0)).T
total_row_general.index = ['Total']
pivot_table_general = pd.concat([total_row_general, pivot_table_general])

# Sort by the average total fertility rate in the period, highest to lowest
sorted_departments_general = pivot_table_general.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_general = pivot_table_general.loc[['Total'] + list(sorted_departments_general)]

# Draw heatmap for total fertility rate
plot_heatmap(pivot_table_general, 'Total Fertility Rate', 'Births per woman during fertility period (15-49 years)')
In [4]:
# Ensure all departments are present in each pivot table
departments_union = (
    set(pivot_table_precoz.index) |
    set(pivot_table_infantil.index) |
    set(pivot_table_adolescente.index) |
    set(pivot_table_general.index)
)

# Create DataFrames filling missing departments with NaN
precoz_totals = pivot_table_precoz.reindex(departments_union).mean(axis=1)
infantil_totals = pivot_table_infantil.reindex(departments_union).mean(axis=1)
adolescente_totals = pivot_table_adolescente.reindex(departments_union).mean(axis=1)
general_totals = pivot_table_general.reindex(departments_union).mean(axis=1)

# Create a DataFrame with the totals of the rates per department
totals_df = pd.DataFrame({
    'Early Fertility': precoz_totals,
    'Infant Fertility': infantil_totals,
    'Adolescent Fertility': adolescente_totals,
    'Total Fertility': general_totals
})

# Sort departments from highest to lowest based on early fertility rate
sorted_departments_precoz = totals_df['Early Fertility'].sort_values(ascending=False).index
totals_df = totals_df.loc[sorted_departments_precoz]

# Create the heatmap with totals of rates per department, sorted by early fertility
plt.figure(figsize=(12, 8))
sns.heatmap(
    totals_df,
    annot=True,
    cmap=grayscale,  # Use the same grayscale color map
    fmt='.2f',
    linewidths=0.5,
    linecolor='black',
    cbar_kws={'label': 'Mean Rate'}
)
plt.title('Period Totals of Rates by Department (Sorted by Early Fertility)', fontsize=16)
plt.ylabel('Department', fontsize=12)
plt.xlabel('Rates', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Model Preparation¶

In [6]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from scipy import stats

# Load the data
df = pd.read_excel('/content/drive/MyDrive/Tadeo/MiTesis/Articulos/SupplementaryMaterial/Supplement_ExcelData_03092024.xlsx', sheet_name='Merge1')

# 1. Early Fertility Rate
df['tasa_fertil_precoz'] = (df['Maternidad infantil'] + df['Maternidad 15 a 19 años']) / ((df['Afiliadas 10 a 14 años'] + df['Afiliadas 15 a 19 años']) * 1000)

# 2. Adolescent Fertility Rate
df['tasa_fertilidad_adolescente'] = (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) * 1000

# 3. Infant Fertility Rate
df['tasa_fertilidad_infantil'] = (df['Maternidad infantil'] / df['Afiliadas 10 a 14 años']) * 1000

# 4. Total Fertility Rate (TFR)
df['tasa_fecundidad_total'] = (
    (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) +
    (df['Maternidad 20 a 24 años'] / df['Afiliadas 20 a 24 años']) +
    (df['Maternidad 25 a 29 años'] / df['Afiliadas 25 a 29 años']) +
    (df['Maternidad 30 a 34 años'] / df['Afiliadas 30 a 34 años']) +
    (df['Maternidad 35 a 39 años'] / df['Afiliadas 35 a 39 años']) +
    (df['Maternidad 40 a 44 años'] / df['Afiliadas 40 a 44 años']) +
    (df['Maternidad 45 a 49 años'] / df['Afiliadas 45 a 49 años'])
) * 5

# 5. Percentage of Population Enrolled in School
df['porcentaje_poblacion_matriculada'] = (df['Población matriculada en colegio'] / df['Afiliados en edad escolar']) * 100

# 6. Percentage of Female Deaths
df['porcentaje_muertes_femeninas'] = (df['Defunciones femenino'] / df['Total defunciones']) * 100

# 7. Percentage of Deaths from External Causes
df['porcentaje_muertes_por_causas_externas'] = (df['Defunciones causa externa'] / df['Total defunciones']) * 100

# 8. Percentage of Female Deaths from External Causes
df['porcentaje_muertes_femeninas_por_causas_externas'] = (df['Defunciones causa externa femenino'] / df['Defunciones femenino']) * 100

# 9. Percentage of Infant Deaths
df['porcentaje_muertes_infantiles'] = (df['Defunciones infantiles'] / df['Total defunciones']) * 100

# 10. Labor Force Participation Rate
df['tasa_participacion_fuerza_laboral'] = df['Tasa de ocupación (TO)']

# 11. Percentage of Population in Subsidized Regime
df['porcentaje_poblacion_subsidiada'] = (df['Afiliados subsidiado'] / df['Total afiliados']) * 100

# 12. GDP per Capita
df['pib_per_capita'] = df['PIB per Cápita']

# 13. Percentage of Indigenous Population
df['% Población indigena'] = df['% Población indigena']

# Create the new dataset with calculated variables
nuevo_df = df[[
    'tasa_fertil_precoz',
    'tasa_fertilidad_infantil',
    'tasa_fertilidad_adolescente',
    'tasa_fecundidad_total',
    'porcentaje_poblacion_matriculada',
    'porcentaje_muertes_femeninas',
    'porcentaje_muertes_por_causas_externas',
    'porcentaje_muertes_femeninas_por_causas_externas',
    'porcentaje_muertes_infantiles',
    'tasa_participacion_fuerza_laboral',
    'porcentaje_poblacion_subsidiada',
    'pib_per_capita',
    '% Población indigena',
    'Departamento',
    'Año'
]]

## Outlier Removal ==========================================================================================

# Calculate Z-scores
z_scores = stats.zscore(nuevo_df[['tasa_fertil_precoz',
                                  'porcentaje_poblacion_matriculada',
                                  'porcentaje_muertes_femeninas',
                                  'porcentaje_muertes_por_causas_externas',
                                  'porcentaje_muertes_femeninas_por_causas_externas',
                                  'porcentaje_muertes_infantiles',
                                  'tasa_participacion_fuerza_laboral',
                                  'porcentaje_poblacion_subsidiada',
                                  'pib_per_capita',
                                  '% Población indigena',
                                  'tasa_fertilidad_infantil',
                                  'tasa_fertilidad_adolescente',
                                  'tasa_fecundidad_total']])
df['z_score'] = abs(z_scores).max(axis=1)

# Define a threshold for Z-scores (commonly 3)
umbral = 3
df_limpio = df[df['z_score'] <= umbral]

# Drop the 'z_score' column after cleaning
df_limpio = df_limpio.drop(columns=['z_score'])

# Use the cleaned dataset for further analysis
nuevo_df_limpio = df_limpio[[
    'tasa_fertil_precoz',
    'porcentaje_poblacion_matriculada',
    'porcentaje_muertes_femeninas',
    'porcentaje_muertes_por_causas_externas',
    'porcentaje_muertes_femeninas_por_causas_externas',
    'porcentaje_muertes_infantiles',
    'tasa_participacion_fuerza_laboral',
    'porcentaje_poblacion_subsidiada',
    'pib_per_capita',
    '% Población indigena',
    'tasa_fertilidad_infantil',
    'tasa_fertilidad_adolescente',
    'tasa_fecundidad_total'
]]

Visualize scatterplots¶

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#====================visualizing correlations
# Create a 5x2 grid for scatter plots
fig, axes = plt.subplots(6, 2, figsize=(14, 22))  # Increased the number of rows
fig.subplots_adjust(hspace=0.4)

# Define a list of variables and corresponding titles in Spanish
variables = [
    ('tasa_fertilidad_infantil', 'Infant Fertility Rate'),
    ('tasa_fertilidad_adolescente', 'Adolescent Fertility Rate'),
    ('tasa_fecundidad_total', 'General Fertility Rate'),
    ('porcentaje_poblacion_subsidiada', '% Subsidized Population'),
    ('tasa_participacion_fuerza_laboral', '% Labor Force Participation'),
    ('% Población indigena', '% Indigenous Population'),
    ('porcentaje_poblacion_matriculada', '% Enrolled School-Aged Population'),
    ('porcentaje_muertes_femeninas', '% Female Deaths of Total Deaths'),
    ('porcentaje_muertes_por_causas_externas', '% Violent Deaths of Total Deaths'),
    ('porcentaje_muertes_femeninas_por_causas_externas', '% Female Violent Deaths of Total Female Deaths'),
    ('porcentaje_muertes_infantiles', '% Infant Deaths of Total Deaths'),
    ('pib_per_capita', 'GDP per Capita')
]

# Iterate over each variable and create a scatter plot with a trend line
for i, (variable, titulo) in enumerate(variables):
    fila = i // 2
    col = i % 2
    sns.regplot(x=df[variable], y=df['tasa_fertil_precoz'], ax=axes[fila, col],
                scatter_kws={'color': 'blue', 's': 50}, line_kws={'color': 'red'})
    axes[fila, col].set_title(titulo)
    axes[fila, col].set_xlabel('')
    axes[fila, col].set_ylabel('Early Fertility Rate')

# Set the main title and display the plot
fig.suptitle('Scatter Plots with Trend Lines Comparing Variables with Early Fertility Rate', fontsize=16)
plt.show()

Generalized Linear Model (GLM) Analysis¶

In [46]:
pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (0.14.4)
Requirement already satisfied: numpy<3,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (1.26.4)
Requirement already satisfied: scipy!=1.9.2,>=1.8 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (1.13.1)
Requirement already satisfied: pandas!=2.1.0,>=1.4 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (2.2.2)
Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (0.5.6)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (24.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.2)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.6->statsmodels) (1.16.0)
In [11]:
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics  # Make sure to import the metrics module for MSE and R²

# Remove rows with null values (if any)
nuevo_df_limpio = df.dropna()

# Define the dependent variable and independent variables
X = nuevo_df_limpio[['porcentaje_poblacion_matriculada',
                     'porcentaje_muertes_femeninas',
                     'porcentaje_muertes_por_causas_externas',
                     'porcentaje_muertes_femeninas_por_causas_externas',
                     'porcentaje_muertes_infantiles',
                     'tasa_participacion_fuerza_laboral',
                     'porcentaje_poblacion_subsidiada',
                     'pib_per_capita',
                     '% Población indigena',
                     # 'tasa_fertilidad_infantil',
                     # 'tasa_fertilidad_adolescente',
                     # 'tasa_fecundidad_total'
                     ]]
y = nuevo_df_limpio['tasa_fertil_precoz']

# Add a constant for the intercept
X = sm.add_constant(X)

# Fit the GLM model
modelo_glm = sm.GLM(y, X, family=sm.families.Gaussian())
resultado = modelo_glm.fit()

# Print the summary of the model
print(resultado.summary())

# Make predictions
y_pred = resultado.predict(X)

# Evaluate the model using MSE and R²
mse_glm = metrics.mean_squared_error(y, y_pred)
r2_glm = metrics.r2_score(y, y_pred)

print(f'MSE of the GLM model: {mse_glm:.2f}')
print(f'R² of the GLM model: {r2_glm:.2f}')
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:     tasa_fertil_precoz   No. Observations:                  138
Model:                            GLM   Df Residuals:                      128
Model Family:                Gaussian   Df Model:                            9
Link Function:               Identity   Scale:                      3.2045e-11
Method:                          IRLS   Log-Likelihood:                 1476.7
Date:                Wed, 16 Oct 2024   Deviance:                   4.1017e-09
Time:                        15:24:59   Pearson chi2:                 4.10e-09
No. Iterations:                     3   Pseudo R-squ. (CS):             0.7554
Covariance Type:            nonrobust                                         
====================================================================================================================
                                                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------
const                                             8.716e-05   1.93e-05      4.521      0.000    4.94e-05       0.000
porcentaje_poblacion_matriculada                  3.863e-08   9.43e-08      0.410      0.682   -1.46e-07    2.23e-07
porcentaje_muertes_femeninas                     -1.466e-06   3.36e-07     -4.359      0.000   -2.12e-06   -8.07e-07
porcentaje_muertes_por_causas_externas           -7.318e-07   2.94e-07     -2.490      0.013   -1.31e-06   -1.56e-07
porcentaje_muertes_femeninas_por_causas_externas -1.946e-07   8.65e-07     -0.225      0.822   -1.89e-06     1.5e-06
porcentaje_muertes_infantiles                     1.172e-06      2e-07      5.867      0.000    7.81e-07    1.56e-06
tasa_participacion_fuerza_laboral                  1.35e-05   8.45e-06      1.598      0.110   -3.06e-06    3.01e-05
porcentaje_poblacion_subsidiada                   1.361e-07   5.73e-08      2.376      0.017    2.38e-08    2.48e-07
pib_per_capita                                   -3.114e-13   7.92e-14     -3.934      0.000   -4.67e-13   -1.56e-13
% Población indigena                             -1.263e-05   7.03e-06     -1.796      0.073   -2.64e-05    1.15e-06
====================================================================================================================
MSE of the GLM model: 0.00
R² of the GLM model: 0.60

Re-run the GLM Model¶

In [12]:
import pandas as pd
import statsmodels.api as sm


X = nuevo_df_limpio[[#'porcentaje_poblacion_matriculada',
                     'porcentaje_muertes_femeninas',
                     'porcentaje_muertes_por_causas_externas',
                     #'porcentaje_muertes_femeninas_por_causas_externas',
                     'porcentaje_muertes_infantiles',
                     #'tasa_participacion_fuerza_laboral',
                     'porcentaje_poblacion_subsidiada',
                     'pib_per_capita',
                     #'% Población indigena',
                     #'tasa_fertilidad_infantil',
                     #'tasa_fertilidad_adolescente',
                     #'tasa_fecundidad_total'
                     ]]
y = nuevo_df_limpio['tasa_fertil_precoz']

# Add a constant for the intercept
X = sm.add_constant(X)

# Fit the GLM model
modelo_glm = sm.GLM(y, X, family=sm.families.Gaussian())
resultado = modelo_glm.fit()

# Print the summary of the model
print(resultado.summary())

# Make predictions
y_pred = resultado.predict(X)

# Evaluate the model using MSE and R²
mse_glm = metrics.mean_squared_error(y, y_pred)
r2_glm = metrics.r2_score(y, y_pred)

print(f'MSE of the GLM model: {mse_glm:.2f}')
print(f'R² of the GLM model: {r2_glm:.2f}')
                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:     tasa_fertil_precoz   No. Observations:                  138
Model:                            GLM   Df Residuals:                      132
Model Family:                Gaussian   Df Model:                            5
Link Function:               Identity   Scale:                      3.2280e-11
Method:                          IRLS   Log-Likelihood:                 1474.1
Date:                Wed, 16 Oct 2024   Deviance:                   4.2610e-09
Time:                        15:25:58   Pearson chi2:                 4.26e-09
No. Iterations:                     3   Pseudo R-squ. (CS):             0.7435
Covariance Type:            nonrobust                                         
==========================================================================================================
                                             coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------
const                                   8.619e-05   1.44e-05      5.978      0.000    5.79e-05       0.000
porcentaje_muertes_femeninas           -1.207e-06   2.88e-07     -4.189      0.000   -1.77e-06   -6.42e-07
porcentaje_muertes_por_causas_externas -7.436e-07   1.45e-07     -5.143      0.000   -1.03e-06    -4.6e-07
porcentaje_muertes_infantiles           9.999e-07   1.54e-07      6.512      0.000    6.99e-07     1.3e-06
porcentaje_poblacion_subsidiada         1.308e-07   4.98e-08      2.628      0.009    3.32e-08    2.28e-07
pib_per_capita                          -3.04e-13   7.56e-14     -4.023      0.000   -4.52e-13   -1.56e-13
==========================================================================================================
MSE of the GLM model: 0.00
R² of the GLM model: 0.59
In [13]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

# Define the model with sklearn
sklearn_model = LinearRegression()

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(sklearn_model, X, y, cv=kf, scoring='r2')

print(f'Average R² in cross-validation: {np.mean(scores):.4f}')

# Train the sklearn model to make predictions
sklearn_model.fit(X, y)
y_pred = sklearn_model.predict(X)

# Residuals plot and Residuals Histogram
residuals = y - y_pred

# Create a figure with two plots side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Residuals scatter plot
axes[0].scatter(y_pred, residuals)
axes[0].axhline(y=0, color='r', linestyle='--')
axes[0].set_title("Residuals Plot")
axes[0].set_xlabel("Predicted Values")
axes[0].set_ylabel("Residuals")

# Residuals histogram
axes[1].hist(residuals, bins=30)
axes[1].set_title("Residuals Histogram")
axes[1].set_xlabel("Residuals")
axes[1].set_ylabel("Frequency")

# Show the figure
plt.tight_layout()
plt.show()
Average R² in cross-validation: 0.5188