🧠 PathogenHawk Demo: Candida auris AMR Prediction¶

In [1]:
import yaml
import pandas as pd
from feature_engineering.build_features import build_features
from ml_model.evaluate_model import evaluate_model
from ml_model.interpret_model import interpret_model
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

🔧 Load Configuration¶

In [2]:
with open("configs/Cauris.yaml", "r") as f:
    config = yaml.safe_load(f)

📊 Step 1: Load Features and Labels¶

In [3]:
X, y = build_features(config)
print("✅ Features shape:", X.shape)
print("✅ Labels:", pd.Series(y).value_counts().to_dict())
✅ Features shape: (50, 10)
✅ Labels: {0: 25, 1: 25}

🔁 Step 2: Split Train/Test¶

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

🤖 Step 3: Train ML Model¶

In [5]:
# 🤖 Step 3: Train ML Model (Updated with warning suppression)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    base_score=0.5
)
model.fit(X_train, y_train)
Out[5]:
XGBClassifier(base_score=0.5, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
objective  'binary:logistic'
base_score  0.5
booster  None
callbacks  None
colsample_bylevel  None
colsample_bynode  None
colsample_bytree  None
device  None
early_stopping_rounds  None
enable_categorical  False
eval_metric  'logloss'
feature_types  None
feature_weights  None
gamma  None
grow_policy  None
importance_type  None
interaction_constraints  None
learning_rate  None
max_bin  None
max_cat_threshold  None
max_cat_to_onehot  None
max_delta_step  None
max_depth  None
max_leaves  None
min_child_weight  None
missing  nan
monotone_constraints  None
multi_strategy  None
n_estimators  None
n_jobs  None
num_parallel_tree  None
random_state  None
reg_alpha  None
reg_lambda  None
sampling_method  None
scale_pos_weight  None
subsample  None
tree_method  None
validate_parameters  None
verbosity  None

📈 Step 4: Evaluate Model¶

In [6]:
evaluate_model(model, X_test, y_test)
🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.88      0.88      0.88         8

    accuracy                           0.85        13
   macro avg       0.84      0.84      0.84        13
weighted avg       0.85      0.85      0.85        13

✅ Accuracy: 0.8461538461538461
No description has been provided for this image

🔬 Step 5: Interpret Feature Importance¶

In [7]:
interpret_model(model, X.columns)
No description has been provided for this image