Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions src/model_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

from logger import get_logger


logger = get_logger(__name__)


def _safe_mape(y_true: pd.Series, y_pred: pd.Series) -> float:
    """Return the mean absolute percentage error, ignoring zero targets.

    Values are compared positionally, matching the scikit-learn metrics this
    helper is reported alongside, so the two inputs do not need to share a
    pandas index. Plain sequences and NumPy arrays are accepted as well.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, in the same order and length as ``y_true``.

    Returns:
        The MAPE expressed as a percentage, or ``nan`` when there is no
        non-zero target to divide by (all zeros or empty input).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    # Rows with a zero target would divide by zero, so they are excluded.
    nonzero = actual != 0
    if not nonzero.any():
        return float("nan")

    relative_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return float(np.mean(relative_error) * 100)


def _group_metrics(df: pd.DataFrame, group_cols: list[str]) -> pd.DataFrame:
    """Compute MAE, RMSE, MAPE and row count for each group of ``df``.

    Args:
        df: Frame holding ``y_true`` and ``y_pred`` columns plus every column
            named in ``group_cols``.
        group_cols: Columns to group by. Rows whose key is missing (NaN) are
            kept and form their own group.

    Returns:
        One row per group: the grouping columns followed by ``mae``, ``rmse``,
        ``mape`` and an integer ``count``. An empty input yields an empty
        frame that still carries those columns.
    """
    metric_cols = ["mae", "rmse", "mape", "count"]
    records = []

    # Iterating the groups (instead of ``groupby.apply`` returning a Series)
    # keeps ``count`` an integer rather than upcasting it to float, and avoids
    # handing the grouping columns to the applied function.
    for key, group in df.groupby(group_cols, dropna=False):
        if group.empty:
            # Defensive: the metric functions cannot score zero rows.
            continue

        # Depending on the pandas version a single grouping column yields
        # either a scalar key or a 1-tuple; normalise to a tuple.
        key_values = key if isinstance(key, tuple) else (key,)
        y_true = group["y_true"]
        y_pred = group["y_pred"]

        record = dict(zip(group_cols, key_values))
        record["mae"] = float(mean_absolute_error(y_true, y_pred))
        record["rmse"] = float(np.sqrt(mean_squared_error(y_true, y_pred)))
        record["mape"] = _safe_mape(y_true, y_pred)
        record["count"] = len(group)
        records.append(record)

    return pd.DataFrame(records, columns=[*group_cols, *metric_cols])


def evaluate_model(
    model,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    output_dir: Path | str = "artifacts/reports",
) -> dict:
    """Score ``model`` on the test split and write an evaluation report.

    Writes ``evaluation_summary.json`` (overall metrics) and one CSV of
    per-group metrics for each breakdown: store, product, store/product pair,
    category and region.

    Args:
        model: Fitted estimator; only ``model.predict(X_test)`` is called.
        X_test: Test features. Must contain the ``Store ID``, ``Product ID``,
            ``Category`` and ``Region`` columns used for the breakdowns.
        y_test: True target values, in the same row order as ``X_test``.
        output_dir: Directory for the report files; created if missing.

    Returns:
        The overall metrics: ``mae``, ``rmse``, ``mape`` and ``count``.
        ``mape`` may be ``nan`` when it cannot be computed.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    y_pred = model.predict(X_test)
    overall = {
        "mae": float(mean_absolute_error(y_test, y_pred)),
        "rmse": float(np.sqrt(mean_squared_error(y_test, y_pred))),
        "mape": _safe_mape(y_test, pd.Series(y_pred, index=y_test.index)),
        "count": int(len(y_test)),
    }

    # Attach targets and predictions to the features so they can be grouped.
    eval_df = X_test.copy()
    eval_df["y_true"] = y_test.values
    eval_df["y_pred"] = y_pred

    # Output filename -> grouping columns, in the order the files are written.
    breakdowns = {
        "metrics_by_store.csv": ["Store ID"],
        "metrics_by_product.csv": ["Product ID"],
        "metrics_by_store_product.csv": ["Store ID", "Product ID"],
        "metrics_by_category.csv": ["Category"],
        "metrics_by_region.csv": ["Region"],
    }
    # Compute every breakdown before writing anything, so a failure here
    # (e.g. a missing column) does not leave a partial report on disk.
    reports = {
        filename: _group_metrics(eval_df, cols)
        for filename, cols in breakdowns.items()
    }

    # json.dumps would emit the bare token ``NaN`` for a non-finite float,
    # which strict JSON parsers reject; store such values as ``null`` instead.
    summary = {
        name: None if isinstance(value, float) and not np.isfinite(value) else value
        for name, value in overall.items()
    }
    (output_path / "evaluation_summary.json").write_text(
        json.dumps(summary, indent=2),
        encoding="utf-8",
    )
    for filename, report in reports.items():
        report.to_csv(output_path / filename, index=False)

    logger.info("Saved evaluation report to %s", output_path)
    return overall
12 changes: 11 additions & 1 deletion src/model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from pathlib import Path

import os

from data_ingestion import ingest
from data_transformation import transform
from model_evaluation import evaluate_model
from model_trainer import train_model
from logger import get_logger

Expand All @@ -13,7 +16,14 @@ def run() -> None:
data_path = Path("data") / "retail_store_inventory.csv"
df = ingest(data_path)
df = transform(df)
_, mae = train_model(df)
if os.getenv("FAST_TRAIN") == "1":
df = df.sample(n=20000, random_state=42)
model, mae, X_test, y_test = train_model(
df, n_estimators=80, random_state=42, return_data=True
)
else:
model, mae, X_test, y_test = train_model(df, return_data=True)
evaluate_model(model, X_test, y_test)
logger.info("Pipeline completed with MAE %s", mae)


Expand Down
3 changes: 3 additions & 0 deletions src/model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def train_model(
df: pd.DataFrame,
n_estimators: int = 200,
random_state: int = 42,
return_data: bool = False,
):
target = "Units Sold"
features = [
Expand Down Expand Up @@ -115,4 +116,6 @@ def train_model(

save_best_model(pipeline, "mae", float(mae))
logger.info("Trained model with MAE %s", mae)
if return_data:
return pipeline, mae, X_test, y_test
return pipeline, mae