EDA 03: sovracampionatura con dimensionalità ridotta
Librerie
In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# XGBClassifier
from xgboost import XGBClassifier
# LGBMClassifier
import lightgbm as lgb
# SVC
from sklearn.svm import SVC
Analisi dati
In [2]:
df = pd.read_parquet('./data/data.parquet')
In [3]:
df.drop(df.columns[52:91], axis=1, inplace=True)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8759 entries, 0 to 8758 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 V1 8759 non-null float64 1 V2 8759 non-null float64 2 V3 8759 non-null float64 3 V4 8759 non-null float64 4 V5 8759 non-null float64 5 V6 8759 non-null float64 6 V7 8759 non-null float64 7 V8 8759 non-null float64 8 V9 8759 non-null float64 9 V10 8759 non-null float64 10 V11 8759 non-null float64 11 V12 8759 non-null float64 12 V13 8759 non-null float64 13 V14 8759 non-null float64 14 V15 8759 non-null float64 15 V16 8759 non-null float64 16 V17 8759 non-null float64 17 V18 8759 non-null float64 18 V19 8759 non-null float64 19 V20 8759 non-null float64 20 V21 8759 non-null float64 21 V22 8759 non-null float64 22 V23 8759 non-null float64 23 V24 8759 non-null float64 24 V25 8759 non-null float64 25 V26 8759 non-null float64 26 V27 8759 non-null float64 27 V28 8759 non-null float64 28 V29 8759 non-null float64 29 V30 8759 non-null float64 30 V31 8759 non-null float64 31 V32 8759 non-null float64 32 V33 8759 non-null float64 33 V34 8759 non-null float64 34 V35 8759 non-null float64 35 V36 8759 non-null float64 36 V37 8759 non-null float64 37 V38 8759 non-null float64 38 V39 8759 non-null float64 39 V40 8759 non-null float64 40 V41 8759 non-null float64 41 V42 8759 non-null float64 42 V43 8759 non-null float64 43 V44 8759 non-null float64 44 V45 8759 non-null float64 45 V46 8759 non-null float64 46 V47 8759 non-null float64 47 V48 8759 non-null float64 48 V49 8759 non-null float64 49 V50 8759 non-null float64 50 V51 8759 non-null float64 51 V52 8759 non-null float64 52 Target 8759 non-null float64 dtypes: float64(53) memory usage: 3.5 MB
RandomForestClassifier
In [5]:
# Separazione delle feature dalla variabile target
X = df.drop(columns=['Target'])
y = df['Target']
In [6]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [7]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [8]:
# Addestramento del modello (Random Forest)
model_rcf_rus = RandomForestClassifier(random_state=42)
model_rcf_rus.fit(X_train, y_train)
Out[8]:
RandomForestClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [9]:
# Predizioni
y_pred = model_rcf_rus.predict(X_test)
In [10]:
# Calcolo delle metriche precision, recall e F1-score
metrics_report = classification_report(y_test, y_pred, target_names=['Classe 0', 'Classe 1'])
In [11]:
print(metrics_report)
precision recall f1-score support Classe 0 0.99 0.97 0.98 2542 Classe 1 0.97 0.99 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [12]:
conf_matrix = confusion_matrix(y_test, y_pred)
# Plot della matrice di confusione
# Creazione della figura con dimensioni personalizzate
fig, ax = plt.subplots(figsize=(20, 10)) # Modifica le dimensioni qui
# Creazione e visualizzazione della matrice di confusione
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=model_rcf_rus.classes_)
disp.plot(cmap='Purples', values_format='d', ax=ax) # Passa l'asse (ax) alla funzione plot
# Titolo
plt.title('Confusion Matrix', fontsize=16)
#plt.savefig('./data/confusion_matrix_1.png')
plt.show()
XGBClassifier
In [13]:
# Separazione delle feature dalla variabile target
X = df.drop(columns=['Target'])
y = df['Target']
In [14]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [15]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [16]:
model_XGBC = XGBClassifier(random_state=42)
model_XGBC.fit(X_train, y_train)
Out[16]:
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=42, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In [17]:
# Predizioni
y_pred_XGBC = model_XGBC.predict(X_test)
In [18]:
# Calcolo delle metriche precision, recall e F1-score
metrics_report_XGBC = classification_report(y_test, y_pred_XGBC, target_names=['Classe 0', 'Classe 1'])
In [19]:
print(metrics_report_XGBC)
precision recall f1-score support Classe 0 0.97 0.98 0.98 2542 Classe 1 0.98 0.97 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [20]:
conf_matrix = confusion_matrix(y_test, y_pred_XGBC)
# Plot della matrice di confusione
# Creazione della figura con dimensioni personalizzate
fig, ax = plt.subplots(figsize=(20, 10)) # Modifica le dimensioni qui
# Creazione e visualizzazione della matrice di confusione
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=model_XGBC.classes_)
disp.plot(cmap='Purples', values_format='d', ax=ax) # Passa l'asse (ax) alla funzione plot
# Titolo
plt.title('Confusion Matrix XGBC', fontsize=16)
plt.savefig('./data/confusion_matrix_XGBC.png')
plt.show()
LGBMClassifier
In [21]:
# Separazione delle feature dalla variabile target
X = df.drop(columns=['Target'])
y = df['Target']
In [22]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [23]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [24]:
model_lgbm = lgb.LGBMClassifier(random_state=42)
model_lgbm.fit(X_train, y_train)
[LightGBM] [Info] Number of positive: 5882, number of negative: 5809 [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006558 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 9970 [LightGBM] [Info] Number of data points in the train set: 11691, number of used features: 49 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503122 -> initscore=0.012488 [LightGBM] [Info] Start training from score 0.012488
Out[24]:
LGBMClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMClassifier(random_state=42)
In [25]:
y_pred_lgbm = model_lgbm.predict(X_test)
In [26]:
# Calcolo delle metriche precision, recall e F1-score
metrics_report_lgbm = classification_report(y_test, y_pred_lgbm, target_names=['Classe 0', 'Classe 1'])
In [27]:
print(metrics_report_lgbm)
precision recall f1-score support Classe 0 0.97 0.99 0.98 2542 Classe 1 0.99 0.97 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [28]:
conf_matrix = confusion_matrix(y_test, y_pred_lgbm)
# Plot della matrice di confusione
# Creazione della figura con dimensioni personalizzate
fig, ax = plt.subplots(figsize=(20, 10)) # Modifica le dimensioni qui
# Creazione e visualizzazione della matrice di confusione
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=model_lgbm.classes_)
disp.plot(cmap='Purples', values_format='d', ax=ax) # Passa l'asse (ax) alla funzione plot
# Titolo
plt.title('Confusion Matrix LightGBM', fontsize=16)
plt.savefig('./data/confusion_matrix_LightGBM.png')
plt.show()
SVC
In [29]:
# Separazione delle feature dalla variabile target
X = df.drop(columns=['Target'])
y = df['Target']
In [30]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [31]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [32]:
model_svc = SVC(random_state=42)
model_svc.fit(X_train, y_train)
Out[32]:
SVC(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(random_state=42)
In [33]:
y_pred_svc = model_svc.predict(X_test)
In [34]:
# Calcolo delle metriche precision, recall e F1-score
metrics_report_svc = classification_report(y_test, y_pred_svc, target_names=['Classe 0', 'Classe 1'])
In [35]:
print(metrics_report_svc)
precision recall f1-score support Classe 0 0.95 0.78 0.86 2542 Classe 1 0.81 0.96 0.88 2469 accuracy 0.87 5011 macro avg 0.88 0.87 0.87 5011 weighted avg 0.88 0.87 0.87 5011
In [36]:
conf_matrix = confusion_matrix(y_test, y_pred_svc)
# Plot della matrice di confusione
# Creazione della figura con dimensioni personalizzate
fig, ax = plt.subplots(figsize=(20, 10)) # Modifica le dimensioni qui
# Creazione e visualizzazione della matrice di confusione
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=model_svc.classes_)
disp.plot(cmap='Purples', values_format='d', ax=ax) # Passa l'asse (ax) alla funzione plot
# Titolo
plt.title('Confusion Matrix SVC', fontsize=16)
plt.savefig('./data/confusion_matrix_SVC.png')
plt.show()
In [ ]: