EDA 03: sovracampionamento con dimensionalità ridotta
In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# XGBClassifier
from xgboost import XGBClassifier
# LGBMClassifier
import lightgbm as lgb
from sklearn.svm import SVC
Analisi dati
In [2]:
df = pd.read_parquet('./data/data.parquet')
In [3]:
df.drop(df.columns[52:91], axis=1, inplace=True)
In [4]:
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8759 entries, 0 to 8758 Data columns (total 53 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 V1 8759 non-null float64 1 V2 8759 non-null float64 2 V3 8759 non-null float64 3 V4 8759 non-null float64 4 V5 8759 non-null float64 5 V6 8759 non-null float64 6 V7 8759 non-null float64 7 V8 8759 non-null float64 8 V9 8759 non-null float64 9 V10 8759 non-null float64 10 V11 8759 non-null float64 11 V12 8759 non-null float64 12 V13 8759 non-null float64 13 V14 8759 non-null float64 14 V15 8759 non-null float64 15 V16 8759 non-null float64 16 V17 8759 non-null float64 17 V18 8759 non-null float64 18 V19 8759 non-null float64 19 V20 8759 non-null float64 20 V21 8759 non-null float64 21 V22 8759 non-null float64 22 V23 8759 non-null float64 23 V24 8759 non-null float64 24 V25 8759 non-null float64 25 V26 8759 non-null float64 26 V27 8759 non-null float64 27 V28 8759 non-null float64 28 V29 8759 non-null float64 29 V30 8759 non-null float64 30 V31 8759 non-null float64 31 V32 8759 non-null float64 32 V33 8759 non-null float64 33 V34 8759 non-null float64 34 V35 8759 non-null float64 35 V36 8759 non-null float64 36 V37 8759 non-null float64 37 V38 8759 non-null float64 38 V39 8759 non-null float64 39 V40 8759 non-null float64 40 V41 8759 non-null float64 41 V42 8759 non-null float64 42 V43 8759 non-null float64 43 V44 8759 non-null float64 44 V45 8759 non-null float64 45 V46 8759 non-null float64 46 V47 8759 non-null float64 47 V48 8759 non-null float64 48 V49 8759 non-null float64 49 V50 8759 non-null float64 50 V51 8759 non-null float64 51 V52 8759 non-null float64 52 Target 8759 non-null float64 dtypes: float64(53) memory usage: 3.5 MB
In [5]:
# Separate the label vector from the feature matrix:
# y is the 'Target' column, X is every remaining column.
y = df['Target']
X = df.drop('Target', axis=1)
In [6]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [7]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [8]:
# Train the model: a random forest with default hyper-parameters, fitted on the
# training split. random_state=42 fixes the seed so the fit is repeatable.
# NOTE(review): the "_rus" suffix reads like a leftover from an under-sampling
# variant of this notebook, while this one uses SMOTE — confirm before renaming.
model_rcf_rus = RandomForestClassifier(random_state=42)
model_rcf_rus.fit(X_train, y_train)
In [9]:
# Predictions: class labels from the random forest for the test split.
y_pred = model_rcf_rus.predict(X_test)
In [10]:
# Compute per-class precision, recall and F1-score as a text report.
# target_names only relabels the rows of the report; it does not select classes.
metrics_report = classification_report(y_test, y_pred, target_names=['Classe 0', 'Classe 1'])
In [11]:
precision recall f1-score support Classe 0 0.99 0.97 0.98 2542 Classe 1 0.97 0.99 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [12]:
# Confusion matrix of the random-forest predictions on the test split.
conf_matrix = confusion_matrix(y_test, y_pred)
# Wide canvas so the annotated cells stay readable (tune figsize here).
fig, ax = plt.subplots(figsize=(20, 10))
# Tick labels come from the classes the fitted model was trained on.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=model_rcf_rus.classes_)
# Draw onto the prepared axes; 'd' renders the counts as plain integers.
disp.plot(ax=ax, cmap='Purples', values_format='d')
# Title set directly on the axes that holds the matrix.
ax.set_title('Confusion Matrix', fontsize=16)
In [13]:
# Separate the label vector from the feature matrix:
# y is the 'Target' column, X is every remaining column.
y = df['Target']
X = df.drop('Target', axis=1)
In [14]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [15]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [16]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters, fitted on the
# training split. random_state=42 fixes the seed so the fit is repeatable.
model_XGBC = XGBClassifier(random_state=42)
model_XGBC.fit(X_train, y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=None, n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In [17]:
# Predictions: class labels from the XGBoost model for the test split.
y_pred_XGBC = model_XGBC.predict(X_test)
In [18]:
# Compute per-class precision, recall and F1-score as a text report.
# target_names only relabels the rows of the report; it does not select classes.
metrics_report_XGBC = classification_report(y_test, y_pred_XGBC, target_names=['Classe 0', 'Classe 1'])
In [19]:
precision recall f1-score support Classe 0 0.97 0.98 0.98 2542 Classe 1 0.98 0.97 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [20]:
# Confusion matrix of the XGBoost predictions on the test split.
conf_matrix = confusion_matrix(y_test, y_pred_XGBC)
# Wide canvas so the annotated cells stay readable (tune figsize here).
fig, ax = plt.subplots(figsize=(20, 10))
# Tick labels come from the classes the fitted model was trained on.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=model_XGBC.classes_)
# Draw onto the prepared axes; 'd' renders the counts as plain integers.
disp.plot(ax=ax, cmap='Purples', values_format='d')
# Title set directly on the axes that holds the matrix.
ax.set_title('Confusion Matrix XGBC', fontsize=16)
In [21]:
# Separate the label vector from the feature matrix:
# y is the 'Target' column, X is every remaining column.
y = df['Target']
X = df.drop('Target', axis=1)
In [22]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [23]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [24]:
# Gradient-boosted trees (LightGBM) with default hyper-parameters, fitted on the
# training split. random_state=42 fixes the seed so the fit is repeatable.
model_lgbm = lgb.LGBMClassifier(random_state=42)
model_lgbm.fit(X_train, y_train)
[LightGBM] [Info] Number of positive: 5882, number of negative: 5809 [LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006558 seconds. You can set `force_col_wise=true` to remove the overhead. [LightGBM] [Info] Total Bins 9970 [LightGBM] [Info] Number of data points in the train set: 11691, number of used features: 49 [LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503122 -> initscore=0.012488 [LightGBM] [Info] Start training from score 0.012488
In [25]:
# Predictions: class labels from the LightGBM model for the test split.
y_pred_lgbm = model_lgbm.predict(X_test)
In [26]:
# Compute per-class precision, recall and F1-score as a text report.
# target_names only relabels the rows of the report; it does not select classes.
metrics_report_lgbm = classification_report(y_test, y_pred_lgbm, target_names=['Classe 0', 'Classe 1'])
In [27]:
precision recall f1-score support Classe 0 0.97 0.99 0.98 2542 Classe 1 0.99 0.97 0.98 2469 accuracy 0.98 5011 macro avg 0.98 0.98 0.98 5011 weighted avg 0.98 0.98 0.98 5011
In [28]:
# Confusion matrix of the LightGBM predictions on the test split.
conf_matrix = confusion_matrix(y_test, y_pred_lgbm)
# Wide canvas so the annotated cells stay readable (tune figsize here).
fig, ax = plt.subplots(figsize=(20, 10))
# Tick labels come from the classes the fitted model was trained on.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=model_lgbm.classes_)
# Draw onto the prepared axes; 'd' renders the counts as plain integers.
disp.plot(ax=ax, cmap='Purples', values_format='d')
# Title set directly on the axes that holds the matrix.
ax.set_title('Confusion Matrix LightGBM', fontsize=16)
In [29]:
# Separate the label vector from the feature matrix:
# y is the 'Target' column, X is every remaining column.
y = df['Target']
X = df.drop('Target', axis=1)
In [30]:
# Sovracampionamento della classe minoritaria
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
In [31]:
# Suddivisione dei dati in training e test set
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
In [32]:
# Support-vector classifier with default settings, fitted on the training split.
# NOTE(review): SVC's default RBF kernel is sensitive to feature scale and no
# scaling step is visible in this notebook — confirm whether the features are
# already standardised upstream; this may explain the weaker scores below.
model_svc = SVC(random_state=42)
model_svc.fit(X_train, y_train)
In [33]:
# Predictions: class labels from the SVC model for the test split.
y_pred_svc = model_svc.predict(X_test)
In [34]:
# Compute per-class precision, recall and F1-score as a text report.
# target_names only relabels the rows of the report; it does not select classes.
metrics_report_svc = classification_report(y_test, y_pred_svc, target_names=['Classe 0', 'Classe 1'])
In [35]:
precision recall f1-score support Classe 0 0.95 0.78 0.86 2542 Classe 1 0.81 0.96 0.88 2469 accuracy 0.87 5011 macro avg 0.88 0.87 0.87 5011 weighted avg 0.88 0.87 0.87 5011
In [36]:
# Confusion matrix of the SVC predictions on the test split.
conf_matrix = confusion_matrix(y_test, y_pred_svc)
# Wide canvas so the annotated cells stay readable (tune figsize here).
fig, ax = plt.subplots(figsize=(20, 10))
# Tick labels come from the classes the fitted model was trained on.
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=model_svc.classes_)
# Draw onto the prepared axes; 'd' renders the counts as plain integers.
disp.plot(ax=ax, cmap='Purples', values_format='d')
# Title set directly on the axes that holds the matrix.
ax.set_title('Confusion Matrix SVC', fontsize=16)
In [ ]: