# -*- coding: utf-8 -*-
"""project ml.ipynb — stroke prediction pipeline.

Automatically generated by Colab.
Original file is located at
    https://colab.research.google.com/drive/1ScYdmUla5mT7Zrevk6Lk9LVw2kmRfChM

Loads the healthcare stroke dataset, performs EDA, preprocesses the
features, balances the training set with SMOTE, and compares six
classifiers via cross-validation and a held-out test set.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# --- Load and inspect the data ---
df = pd.read_csv("healthcare_stroke_dataset.csv")
df.head()
df.info()
df.shape
df.describe()
df.isnull().sum()

# Impute missing BMI with the column mean.
# FIX: assign the result instead of inplace=True on a column slice —
# chained inplace fillna is deprecated in pandas 2.x and may silently
# fail to modify the parent DataFrame.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.isnull().sum()
df.describe()

# --- Distributions of the numeric features ---
numeric_cols = ['age', 'avg_glucose_level', 'bmi']
plt.figure(figsize=(12, 8))
for i, col in enumerate(numeric_cols):
    plt.subplot(2, 2, i + 1)
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# --- Outlier check via boxplots ---
plt.figure(figsize=(12, 6))
for i, col in enumerate(numeric_cols):
    plt.subplot(1, 3, i + 1)
    sns.boxplot(y=df[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

# Drop non-predictive columns when present.
# FIX: the original guarded the 'id' drop but dropped 'date'
# unconditionally, which raises KeyError when that column is absent;
# both drops are now guarded the same way.
df = df.drop(columns=[c for c in ('id', 'date') if c in df.columns])

# Correlation heatmap over numeric columns only.
numeric_df = df.select_dtypes(include=np.number)
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# One-hot encode categoricals (drop_first avoids the dummy-variable trap).
# FIX: the original built the identical dummy frame twice (`dfe` was
# never used again); the duplicate call is removed.
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.head()

y = df_encoded['stroke']
X = df_encoded.drop('stroke', axis=1)

# Split the data: 80% train / 20% test.
# FIX: stratify on y so the rare stroke class keeps the same proportion
# in both splits — the target is heavily imbalanced, and an unstratified
# split can leave the test set with almost no positive cases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Configure training data:", X_train.shape)
print("Test data format:", X_test.shape)

# --- Balance the TRAINING set only (never the test set) ---
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("📊 Before balancing:")
print(y_train.value_counts())
print("\n📊 After balancing:")
print(y_train_res.value_counts())

# --- Model definitions ---
# FIX: scale_pos_weight is now the neg/pos ratio of the ORIGINAL
# y_train. The original computed it from y_train_res, which SMOTE has
# already balanced, so the ratio was always ~1 and the class
# reweighting was a no-op.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
model_lr = LogisticRegression(class_weight='balanced', max_iter=3000)
model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
model_xgb = XGBClassifier(scale_pos_weight=neg / pos)
model_knn = KNeighborsClassifier(n_neighbors=5)
model_mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model_svc = SVC(class_weight='balanced', probability=True, random_state=42)

models = {
    "Logistic Regression": model_lr,
    "Random Forest": model_rf,
    "XGBoost": model_xgb,
    "KNN": model_knn,
    "MLP": model_mlp,
    "SVC": model_svc,
}

# NOTE(review): cross-validating on the SMOTE-resampled training set lets
# synthetic neighbours leak across folds and inflates the CV F1 score; for
# unbiased CV, wrap SMOTE + estimator in an imblearn Pipeline so resampling
# happens per-fold. Kept as-is to preserve the original protocol — the
# held-out test-set metrics below are unaffected.
results = {}
cv_folds = 5
for name, model in models.items():
    print("=" * 60)
    print(f"🤖 Training Model with Cross Validation: {name}")
    print("=" * 60)

    cv_scores = cross_val_score(model, X_train_res, y_train_res,
                                cv=cv_folds, scoring='f1')
    print(f"Cross Validation F1 Scores: {cv_scores}")
    print(f"Average CV F1 Score: {cv_scores.mean():.3f}")

    # Fit on the balanced training data, evaluate on the untouched test set.
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=3, output_dict=True)
    conf = confusion_matrix(y_test, y_pred)

    # Report minority-class (stroke = 1) metrics — accuracy alone is
    # misleading on imbalanced data.
    results[name] = {
        "Accuracy": acc,
        "Precision_1": report["1"]["precision"],
        "Recall_1": report["1"]["recall"],
        "F1_1": report["1"]["f1-score"],
        "CV_F1_Avg": cv_scores.mean(),
    }
    print(f"Accuracy: {acc:.3f}")
    print(f"Precision (Class 1): {report['1']['precision']:.3f}")
    print(f"Recall (Class 1): {report['1']['recall']:.3f}")
    print(f"F1-score (Class 1): {report['1']['f1-score']:.3f}")
    print(f"Confusion Matrix:\n{conf}")
    print("-" * 60, "\n")

results_df = pd.DataFrame(results).T
print("\n📊 Model Comparison (After Cross Validation):")
# FIX: display() exists only inside IPython/Colab; fall back to print()
# so the exported script also runs under plain Python.
try:
    display(results_df)
except NameError:
    print(results_df)