import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
# Define a grayscale colormap
grayscale = LinearSegmentedColormap.from_list(
'Grayscale', ['#f0f0f0', '#d0d0d0', '#b0b0b0', '#808080', '#000000']
)
# Load the data
df = pd.read_excel('/content/drive/MyDrive/Tadeo/MiTesis/Articulos/SupplementaryMaterial/Supplement_ExcelData_03092024.xlsx', sheet_name='Merge1')
# Total Fertility Rate (TFR)
df['tasa_fecundidad_global'] = (
(df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) +
(df['Maternidad 20 a 24 años'] / df['Afiliadas 20 a 24 años']) +
(df['Maternidad 25 a 29 años'] / df['Afiliadas 25 a 29 años']) +
(df['Maternidad 30 a 34 años'] / df['Afiliadas 30 a 34 años']) +
(df['Maternidad 35 a 39 años'] / df['Afiliadas 35 a 39 años']) +
(df['Maternidad 40 a 44 años'] / df['Afiliadas 40 a 44 años']) +
(df['Maternidad 45 a 49 años'] / df['Afiliadas 45 a 49 años'])
) * 5
# Infant Fertility Rate (10-14 years)
df['tasa_fertilidad_infantil'] = (df['Maternidad infantil'] / df['Afiliadas 10 a 14 años']) * 1000
# Adolescent Fertility Rate (15-19 years)
df['tasa_fertilidad_adolescente'] = (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) * 1000
# Early Fertility Rate (<19 years)
df['tasa_fertilidad_Precoz'] = (
(df['Maternidad infantil'] + df['Maternidad 15 a 19 años']) /
(df['Afiliadas 10 a 14 años'] + df['Afiliadas 15 a 19 años']) * 1000
)
# Function to plot heatmaps
def plot_heatmap(data, title, cbar_label):
plt.figure(figsize=(14, 10))
sns.heatmap(
data,
annot=True,
cmap=grayscale,
fmt='.2f',
linewidths=0.5,
linecolor='black',
cbar_kws={'label': cbar_label}
)
plt.title(title, fontsize=16)
plt.ylabel('Department', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
#======================= Infant Fertility ============================================================
# Create heatmap for infant fertility rate
pivot_table_infantil = df.pivot_table(
values='tasa_fertilidad_infantil',
index='Departamento',
columns='Año',
aggfunc='mean'
)
# Add a total row for average infant fertility rate across the period
total_row_infantil = pd.DataFrame(pivot_table_infantil.mean(axis=0)).T
total_row_infantil.index = ['Total']
pivot_table_infantil = pd.concat([total_row_infantil, pivot_table_infantil])
# Sort by the average infant fertility rate in the period, highest to lowest
sorted_departments_infantil = pivot_table_infantil.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_infantil = pivot_table_infantil.loc[['Total'] + list(sorted_departments_infantil)]
# Draw heatmap for infant fertility rate
plot_heatmap(pivot_table_infantil, 'Infant Fertility Rate', '1,000 women (10-14 years)')
#======================= Adolescent Fertility =======================================================
# Create heatmap for adolescent fertility rate
pivot_table_adolescente = df.pivot_table(
values='tasa_fertilidad_adolescente',
index='Departamento',
columns='Año',
aggfunc='mean'
)
# Add a total row for average adolescent fertility rate across the period
total_row_adolescente = pd.DataFrame(pivot_table_adolescente.mean(axis=0)).T
total_row_adolescente.index = ['Total']
pivot_table_adolescente = pd.concat([total_row_adolescente, pivot_table_adolescente])
# Sort by the average adolescent fertility rate in the period, highest to lowest
sorted_departments_adolescente = pivot_table_adolescente.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_adolescente = pivot_table_adolescente.loc[['Total'] + list(sorted_departments_adolescente)]
# Draw heatmap for adolescent fertility rate
plot_heatmap(pivot_table_adolescente, 'Adolescent Fertility Rate', '1,000 women (15-19 years)')
#======================= Early Fertility =============================================================
# Create heatmap for early fertility rate (infant + adolescent)
pivot_table_precoz = df.pivot_table(
values='tasa_fertilidad_Precoz',
index='Departamento',
columns='Año',
aggfunc='mean'
)
# Add a total row for average early fertility rate across the period
total_row_precoz = pd.DataFrame(pivot_table_precoz.mean(axis=0)).T
total_row_precoz.index = ['Total']
pivot_table_precoz = pd.concat([total_row_precoz, pivot_table_precoz])
# Sort by the average early fertility rate in the period, highest to lowest
sorted_departments_precoz = pivot_table_precoz.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_precoz = pivot_table_precoz.loc[['Total'] + list(sorted_departments_precoz)]
# Draw heatmap for early fertility rate
plot_heatmap(pivot_table_precoz, 'Early Fertility Rate', '1,000 women (10-19 years)')
#======================= Total Fertility =============================================================
# Create heatmap for total fertility rate (TFR)
pivot_table_general = df.pivot_table(
values='tasa_fecundidad_global',
index='Departamento',
columns='Año',
aggfunc='mean'
)
# Add a total row for average total fertility rate across the period
total_row_general = pd.DataFrame(pivot_table_general.mean(axis=0)).T
total_row_general.index = ['Total']
pivot_table_general = pd.concat([total_row_general, pivot_table_general])
# Sort by the average total fertility rate in the period, highest to lowest
sorted_departments_general = pivot_table_general.iloc[1:].mean(axis=1).sort_values(ascending=False).index
pivot_table_general = pivot_table_general.loc[['Total'] + list(sorted_departments_general)]
# Draw heatmap for total fertility rate
plot_heatmap(pivot_table_general, 'Total Fertility Rate', 'Births per woman during fertility period (15-49 years)')
# Ensure all departments are present in each pivot table
departments_union = (
set(pivot_table_precoz.index) |
set(pivot_table_infantil.index) |
set(pivot_table_adolescente.index) |
set(pivot_table_general.index)
)
# Create DataFrames filling missing departments with NaN
precoz_totals = pivot_table_precoz.reindex(departments_union).mean(axis=1)
infantil_totals = pivot_table_infantil.reindex(departments_union).mean(axis=1)
adolescente_totals = pivot_table_adolescente.reindex(departments_union).mean(axis=1)
general_totals = pivot_table_general.reindex(departments_union).mean(axis=1)
# Create a DataFrame with the totals of the rates per department
totals_df = pd.DataFrame({
'Early Fertility': precoz_totals,
'Infant Fertility': infantil_totals,
'Adolescent Fertility': adolescente_totals,
'Total Fertility': general_totals
})
# Sort departments from highest to lowest based on early fertility rate
sorted_departments_precoz = totals_df['Early Fertility'].sort_values(ascending=False).index
totals_df = totals_df.loc[sorted_departments_precoz]
# Create the heatmap with totals of rates per department, sorted by early fertility
plt.figure(figsize=(12, 8))
sns.heatmap(
totals_df,
annot=True,
cmap=grayscale, # Use the same grayscale color map
fmt='.2f',
linewidths=0.5,
linecolor='black',
cbar_kws={'label': 'Mean Rate'}
)
plt.title('Period Totals of Rates by Department (Sorted by Early Fertility)', fontsize=16)
plt.ylabel('Department', fontsize=12)
plt.xlabel('Rates', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from scipy import stats
# Load the data
df = pd.read_excel('/content/drive/MyDrive/Tadeo/MiTesis/Articulos/SupplementaryMaterial/Supplement_ExcelData_03092024.xlsx', sheet_name='Merge1')
# 1. Early Fertility Rate
df['tasa_fertil_precoz'] = (df['Maternidad infantil'] + df['Maternidad 15 a 19 años']) / ((df['Afiliadas 10 a 14 años'] + df['Afiliadas 15 a 19 años']) * 1000)
# 2. Adolescent Fertility Rate
df['tasa_fertilidad_adolescente'] = (df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) * 1000
# 3. Infant Fertility Rate
df['tasa_fertilidad_infantil'] = (df['Maternidad infantil'] / df['Afiliadas 10 a 14 años']) * 1000
# 4. Total Fertility Rate (TFR)
df['tasa_fecundidad_total'] = (
(df['Maternidad 15 a 19 años'] / df['Afiliadas 15 a 19 años']) +
(df['Maternidad 20 a 24 años'] / df['Afiliadas 20 a 24 años']) +
(df['Maternidad 25 a 29 años'] / df['Afiliadas 25 a 29 años']) +
(df['Maternidad 30 a 34 años'] / df['Afiliadas 30 a 34 años']) +
(df['Maternidad 35 a 39 años'] / df['Afiliadas 35 a 39 años']) +
(df['Maternidad 40 a 44 años'] / df['Afiliadas 40 a 44 años']) +
(df['Maternidad 45 a 49 años'] / df['Afiliadas 45 a 49 años'])
) * 5
# 5. Percentage of Population Enrolled in School
df['porcentaje_poblacion_matriculada'] = (df['Población matriculada en colegio'] / df['Afiliados en edad escolar']) * 100
# 6. Percentage of Female Deaths
df['porcentaje_muertes_femeninas'] = (df['Defunciones femenino'] / df['Total defunciones']) * 100
# 7. Percentage of Deaths from External Causes
df['porcentaje_muertes_por_causas_externas'] = (df['Defunciones causa externa'] / df['Total defunciones']) * 100
# 8. Percentage of Female Deaths from External Causes
df['porcentaje_muertes_femeninas_por_causas_externas'] = (df['Defunciones causa externa femenino'] / df['Defunciones femenino']) * 100
# 9. Percentage of Infant Deaths
df['porcentaje_muertes_infantiles'] = (df['Defunciones infantiles'] / df['Total defunciones']) * 100
# 10. Labor Force Participation Rate
df['tasa_participacion_fuerza_laboral'] = df['Tasa de ocupación (TO)']
# 11. Percentage of Population in Subsidized Regime
df['porcentaje_poblacion_subsidiada'] = (df['Afiliados subsidiado'] / df['Total afiliados']) * 100
# 12. GDP per Capita
df['pib_per_capita'] = df['PIB per Cápita']
# 13. Percentage of Indigenous Population
df['% Población indigena'] = df['% Población indigena']
# Create the new dataset with calculated variables
nuevo_df = df[[
'tasa_fertil_precoz',
'tasa_fertilidad_infantil',
'tasa_fertilidad_adolescente',
'tasa_fecundidad_total',
'porcentaje_poblacion_matriculada',
'porcentaje_muertes_femeninas',
'porcentaje_muertes_por_causas_externas',
'porcentaje_muertes_femeninas_por_causas_externas',
'porcentaje_muertes_infantiles',
'tasa_participacion_fuerza_laboral',
'porcentaje_poblacion_subsidiada',
'pib_per_capita',
'% Población indigena',
'Departamento',
'Año'
]]
## Outlier Removal ==========================================================================================
# Calculate Z-scores
z_scores = stats.zscore(nuevo_df[['tasa_fertil_precoz',
'porcentaje_poblacion_matriculada',
'porcentaje_muertes_femeninas',
'porcentaje_muertes_por_causas_externas',
'porcentaje_muertes_femeninas_por_causas_externas',
'porcentaje_muertes_infantiles',
'tasa_participacion_fuerza_laboral',
'porcentaje_poblacion_subsidiada',
'pib_per_capita',
'% Población indigena',
'tasa_fertilidad_infantil',
'tasa_fertilidad_adolescente',
'tasa_fecundidad_total']])
df['z_score'] = abs(z_scores).max(axis=1)
# Define a threshold for Z-scores (commonly 3)
umbral = 3
df_limpio = df[df['z_score'] <= umbral]
# Drop the 'z_score' column after cleaning
df_limpio = df_limpio.drop(columns=['z_score'])
# Use the cleaned dataset for further analysis
nuevo_df_limpio = df_limpio[[
'tasa_fertil_precoz',
'porcentaje_poblacion_matriculada',
'porcentaje_muertes_femeninas',
'porcentaje_muertes_por_causas_externas',
'porcentaje_muertes_femeninas_por_causas_externas',
'porcentaje_muertes_infantiles',
'tasa_participacion_fuerza_laboral',
'porcentaje_poblacion_subsidiada',
'pib_per_capita',
'% Población indigena',
'tasa_fertilidad_infantil',
'tasa_fertilidad_adolescente',
'tasa_fecundidad_total'
]]
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#====================visualizing correlations
# Create a 5x2 grid for scatter plots
fig, axes = plt.subplots(6, 2, figsize=(14, 22)) # Increased the number of rows
fig.subplots_adjust(hspace=0.4)
# Define a list of variables and corresponding titles in Spanish
variables = [
('tasa_fertilidad_infantil', 'Infant Fertility Rate'),
('tasa_fertilidad_adolescente', 'Adolescent Fertility Rate'),
('tasa_fecundidad_total', 'General Fertility Rate'),
('porcentaje_poblacion_subsidiada', '% Subsidized Population'),
('tasa_participacion_fuerza_laboral', '% Labor Force Participation'),
('% Población indigena', '% Indigenous Population'),
('porcentaje_poblacion_matriculada', '% Enrolled School-Aged Population'),
('porcentaje_muertes_femeninas', '% Female Deaths of Total Deaths'),
('porcentaje_muertes_por_causas_externas', '% Violent Deaths of Total Deaths'),
('porcentaje_muertes_femeninas_por_causas_externas', '% Female Violent Deaths of Total Female Deaths'),
('porcentaje_muertes_infantiles', '% Infant Deaths of Total Deaths'),
('pib_per_capita', 'GDP per Capita')
]
# Iterate over each variable and create a scatter plot with a trend line
for i, (variable, titulo) in enumerate(variables):
fila = i // 2
col = i % 2
sns.regplot(x=df[variable], y=df['tasa_fertil_precoz'], ax=axes[fila, col],
scatter_kws={'color': 'blue', 's': 50}, line_kws={'color': 'red'})
axes[fila, col].set_title(titulo)
axes[fila, col].set_xlabel('')
axes[fila, col].set_ylabel('Early Fertility Rate')
# Set the main title and display the plot
fig.suptitle('Scatter Plots with Trend Lines Comparing Variables with Early Fertility Rate', fontsize=16)
plt.show()
pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (0.14.4) Requirement already satisfied: numpy<3,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (1.26.4) Requirement already satisfied: scipy!=1.9.2,>=1.8 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (1.13.1) Requirement already satisfied: pandas!=2.1.0,>=1.4 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (2.2.2) Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (0.5.6) Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from statsmodels) (24.1) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas!=2.1.0,>=1.4->statsmodels) (2024.2) Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.6->statsmodels) (1.16.0)
import pandas as pd
import statsmodels.api as sm
from sklearn import metrics # Make sure to import the metrics module for MSE and R²
# Remove rows with null values (if any)
nuevo_df_limpio = df.dropna()
# Define the dependent variable and independent variables
X = nuevo_df_limpio[['porcentaje_poblacion_matriculada',
'porcentaje_muertes_femeninas',
'porcentaje_muertes_por_causas_externas',
'porcentaje_muertes_femeninas_por_causas_externas',
'porcentaje_muertes_infantiles',
'tasa_participacion_fuerza_laboral',
'porcentaje_poblacion_subsidiada',
'pib_per_capita',
'% Población indigena',
# 'tasa_fertilidad_infantil',
# 'tasa_fertilidad_adolescente',
# 'tasa_fecundidad_total'
]]
y = nuevo_df_limpio['tasa_fertil_precoz']
# Add a constant for the intercept
X = sm.add_constant(X)
# Fit the GLM model
modelo_glm = sm.GLM(y, X, family=sm.families.Gaussian())
resultado = modelo_glm.fit()
# Print the summary of the model
print(resultado.summary())
# Make predictions
y_pred = resultado.predict(X)
# Evaluate the model using MSE and R²
mse_glm = metrics.mean_squared_error(y, y_pred)
r2_glm = metrics.r2_score(y, y_pred)
print(f'MSE of the GLM model: {mse_glm:.2f}')
print(f'R² of the GLM model: {r2_glm:.2f}')
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: tasa_fertil_precoz No. Observations: 138
Model: GLM Df Residuals: 128
Model Family: Gaussian Df Model: 9
Link Function: Identity Scale: 3.2045e-11
Method: IRLS Log-Likelihood: 1476.7
Date: Wed, 16 Oct 2024 Deviance: 4.1017e-09
Time: 15:24:59 Pearson chi2: 4.10e-09
No. Iterations: 3 Pseudo R-squ. (CS): 0.7554
Covariance Type: nonrobust
====================================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------
const 8.716e-05 1.93e-05 4.521 0.000 4.94e-05 0.000
porcentaje_poblacion_matriculada 3.863e-08 9.43e-08 0.410 0.682 -1.46e-07 2.23e-07
porcentaje_muertes_femeninas -1.466e-06 3.36e-07 -4.359 0.000 -2.12e-06 -8.07e-07
porcentaje_muertes_por_causas_externas -7.318e-07 2.94e-07 -2.490 0.013 -1.31e-06 -1.56e-07
porcentaje_muertes_femeninas_por_causas_externas -1.946e-07 8.65e-07 -0.225 0.822 -1.89e-06 1.5e-06
porcentaje_muertes_infantiles 1.172e-06 2e-07 5.867 0.000 7.81e-07 1.56e-06
tasa_participacion_fuerza_laboral 1.35e-05 8.45e-06 1.598 0.110 -3.06e-06 3.01e-05
porcentaje_poblacion_subsidiada 1.361e-07 5.73e-08 2.376 0.017 2.38e-08 2.48e-07
pib_per_capita -3.114e-13 7.92e-14 -3.934 0.000 -4.67e-13 -1.56e-13
% Población indigena -1.263e-05 7.03e-06 -1.796 0.073 -2.64e-05 1.15e-06
====================================================================================================================
MSE of the GLM model: 0.00
R² of the GLM model: 0.60
import pandas as pd
import statsmodels.api as sm
X = nuevo_df_limpio[[#'porcentaje_poblacion_matriculada',
'porcentaje_muertes_femeninas',
'porcentaje_muertes_por_causas_externas',
#'porcentaje_muertes_femeninas_por_causas_externas',
'porcentaje_muertes_infantiles',
#'tasa_participacion_fuerza_laboral',
'porcentaje_poblacion_subsidiada',
'pib_per_capita',
#'% Población indigena',
#'tasa_fertilidad_infantil',
#'tasa_fertilidad_adolescente',
#'tasa_fecundidad_total'
]]
y = nuevo_df_limpio['tasa_fertil_precoz']
# Add a constant for the intercept
X = sm.add_constant(X)
# Fit the GLM model
modelo_glm = sm.GLM(y, X, family=sm.families.Gaussian())
resultado = modelo_glm.fit()
# Print the summary of the model
print(resultado.summary())
# Make predictions
y_pred = resultado.predict(X)
# Evaluate the model using MSE and R²
mse_glm = metrics.mean_squared_error(y, y_pred)
r2_glm = metrics.r2_score(y, y_pred)
print(f'MSE of the GLM model: {mse_glm:.2f}')
print(f'R² of the GLM model: {r2_glm:.2f}')
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: tasa_fertil_precoz No. Observations: 138
Model: GLM Df Residuals: 132
Model Family: Gaussian Df Model: 5
Link Function: Identity Scale: 3.2280e-11
Method: IRLS Log-Likelihood: 1474.1
Date: Wed, 16 Oct 2024 Deviance: 4.2610e-09
Time: 15:25:58 Pearson chi2: 4.26e-09
No. Iterations: 3 Pseudo R-squ. (CS): 0.7435
Covariance Type: nonrobust
==========================================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------------------------
const 8.619e-05 1.44e-05 5.978 0.000 5.79e-05 0.000
porcentaje_muertes_femeninas -1.207e-06 2.88e-07 -4.189 0.000 -1.77e-06 -6.42e-07
porcentaje_muertes_por_causas_externas -7.436e-07 1.45e-07 -5.143 0.000 -1.03e-06 -4.6e-07
porcentaje_muertes_infantiles 9.999e-07 1.54e-07 6.512 0.000 6.99e-07 1.3e-06
porcentaje_poblacion_subsidiada 1.308e-07 4.98e-08 2.628 0.009 3.32e-08 2.28e-07
pib_per_capita -3.04e-13 7.56e-14 -4.023 0.000 -4.52e-13 -1.56e-13
==========================================================================================================
MSE of the GLM model: 0.00
R² of the GLM model: 0.59
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
# Define the model with sklearn
sklearn_model = LinearRegression()
# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(sklearn_model, X, y, cv=kf, scoring='r2')
print(f'Average R² in cross-validation: {np.mean(scores):.4f}')
# Train the sklearn model to make predictions
sklearn_model.fit(X, y)
y_pred = sklearn_model.predict(X)
# Residuals plot and Residuals Histogram
residuals = y - y_pred
# Create a figure with two plots side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Residuals scatter plot
axes[0].scatter(y_pred, residuals)
axes[0].axhline(y=0, color='r', linestyle='--')
axes[0].set_title("Residuals Plot")
axes[0].set_xlabel("Predicted Values")
axes[0].set_ylabel("Residuals")
# Residuals histogram
axes[1].hist(residuals, bins=30)
axes[1].set_title("Residuals Histogram")
axes[1].set_xlabel("Residuals")
axes[1].set_ylabel("Frequency")
# Show the figure
plt.tight_layout()
plt.show()
Average R² in cross-validation: 0.5188