import yaml
import pandas as pd
from feature_engineering.build_features import build_features
from ml_model.evaluate_model import evaluate_model
from ml_model.interpret_model import interpret_model
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the run configuration from the YAML file.
# YAML files are conventionally UTF-8, so decode explicitly rather than relying
# on the platform's locale default (e.g. cp1252 on Windows), which can fail on
# or mangle non-ASCII values.
with open("configs/Cauris.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Build the feature matrix and the label vector from the loaded configuration,
# then report the matrix shape and how many samples carry each label.
X, y = build_features(config)
print("✅ Features shape:", X.shape)
label_counts = pd.Series(y).value_counts().to_dict()
print("✅ Labels:", label_counts)

✅ Features shape: (50, 10)
✅ Labels: {0: 25, 1: 25}

# Hold out 25% of the samples for evaluation, with a fixed seed so the split is
# reproducible. Stratify on the labels so the small test split keeps the class
# proportions of the full dataset instead of a skewed random draw.
# NOTE: metrics recorded from earlier, unstratified runs will differ once the
# script is re-run with this split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Step 3: train the classifier.
# UserWarning is filtered out process-wide from this point on, so it also
# stays silenced for every step that runs after training.
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Hyper-parameters passed explicitly; everything else is left at the
# library's defaults.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "base_score": 0.5,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)

# Score the fitted model on the held-out test split.
# NOTE(review): the recorded run shows a classification report and an accuracy
# figure at this point — presumably printed by evaluate_model; confirm in
# ml_model.evaluate_model.
evaluate_model(model, X_test, y_test)

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.88      0.88      0.88         8

    accuracy                           0.85        13
   macro avg       0.84      0.84      0.84        13
weighted avg       0.85      0.85      0.85        13

✅ Accuracy: 0.8461538461538461

# Hand the fitted model and the feature column names to the interpretation step.
# NOTE(review): presumably reports feature importance — confirm in
# ml_model.interpret_model.
interpret_model(model, X.columns)

🧠 PathogenHawk Demo: Candida auris AMR Prediction

🔧 Load Configuration

📊 Step 1: Load Features and Labels

🔁 Step 2: Split Train/Test

🤖 Step 3: Train ML Model

📈 Step 4: Evaluate Model

🔬 Step 5: Interpret Feature Importance

	objective	'binary:logistic'
	base_score	0.5
	booster	None
	callbacks	None
	colsample_bylevel	None
	colsample_bynode	None
	colsample_bytree	None
	device	None
	early_stopping_rounds	None
	enable_categorical	False
	eval_metric	'logloss'
	feature_types	None
	feature_weights	None
	gamma	None
	grow_policy	None
	importance_type	None
	interaction_constraints	None
	learning_rate	None
	max_bin	None
	max_cat_threshold	None
	max_cat_to_onehot	None
	max_delta_step	None
	max_depth	None
	max_leaves	None
	min_child_weight	None
	missing	nan
	monotone_constraints	None
	multi_strategy	None
	n_estimators	None
	n_jobs	None
	num_parallel_tree	None
	random_state	None
	reg_alpha	None
	reg_lambda	None
	sampling_method	None
	scale_pos_weight	None
	subsample	None
	tree_method	None
	validate_parameters	None
	verbosity	None