🧠 PathogenHawk Demo: Candida auris AMR Prediction¶
In [1]:
import yaml
import pandas as pd
from feature_engineering.build_features import build_features
from ml_model.evaluate_model import evaluate_model
from ml_model.interpret_model import interpret_model
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
🔧 Load Configuration¶
In [2]:
with open("configs/Cauris.yaml", "r") as f:
config = yaml.safe_load(f)
📊 Step 1: Load Features and Labels¶
In [3]:
X, y = build_features(config)
print("✅ Features shape:", X.shape)
print("✅ Labels:", pd.Series(y).value_counts().to_dict())
✅ Features shape: (50, 10)
✅ Labels: {0: 25, 1: 25}
🔁 Step 2: Split Train/Test¶
In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
🤖 Step 3: Train ML Model¶
In [5]:
# 🤖 Step 3: Train ML Model (Updated with warning suppression)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
model = XGBClassifier(
objective="binary:logistic",
eval_metric="logloss",
base_score=0.5
)
model.fit(X_train, y_train)
Out[5]:
XGBClassifier(base_score=0.5, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, feature_weights=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| objective | 'binary:logistic' | |
| base_score | 0.5 | |
| booster | None | |
| callbacks | None | |
| colsample_bylevel | None | |
| colsample_bynode | None | |
| colsample_bytree | None | |
| device | None | |
| early_stopping_rounds | None | |
| enable_categorical | False | |
| eval_metric | 'logloss' | |
| feature_types | None | |
| feature_weights | None | |
| gamma | None | |
| grow_policy | None | |
| importance_type | None | |
| interaction_constraints | None | |
| learning_rate | None | |
| max_bin | None | |
| max_cat_threshold | None | |
| max_cat_to_onehot | None | |
| max_delta_step | None | |
| max_depth | None | |
| max_leaves | None | |
| min_child_weight | None | |
| missing | nan | |
| monotone_constraints | None | |
| multi_strategy | None | |
| n_estimators | None | |
| n_jobs | None | |
| num_parallel_tree | None | |
| random_state | None | |
| reg_alpha | None | |
| reg_lambda | None | |
| sampling_method | None | |
| scale_pos_weight | None | |
| subsample | None | |
| tree_method | None | |
| validate_parameters | None | |
| verbosity | None |
📈 Step 4: Evaluate Model¶
In [6]:
evaluate_model(model, X_test, y_test)
🔍 Classification Report:
precision recall f1-score support
0 0.80 0.80 0.80 5
1 0.88 0.88 0.88 8
accuracy 0.85 13
macro avg 0.84 0.84 0.84 13
weighted avg 0.85 0.85 0.85 13
✅ Accuracy: 0.8461538461538461
🔬 Step 5: Interpret Feature Importance¶
In [7]:
interpret_model(model, X.columns)