From 1a0a193081d48660a8b7712850837ea2e35cdb6b Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 09:54:33 +0200
Subject: [PATCH 01/14] add TMVA style training

---
 src/eventdisplay_ml/data_processing.py |  1 +
 src/eventdisplay_ml/features.py        | 29 ++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index b9fa59b..a845fb7 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -1195,6 +1195,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             "EChi2S": _to_numpy_1d(df["EChi2S"], np.float32),
             "EmissionHeight": _to_numpy_1d(df["EmissionHeight"], np.float32),
             "EmissionHeightChi2": _to_numpy_1d(df["EmissionHeightChi2"], np.float32),
+            "SizeSecondMax": _to_numpy_1d(df["SizeSecondMax"], np.float32),
         }
         if not training:
             data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32)
diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
index b4818d7..9a069d6 100644
--- a/src/eventdisplay_ml/features.py
+++ b/src/eventdisplay_ml/features.py
@@ -136,8 +136,33 @@ def _regression_features(training):
     return var
 
 
-def _classification_features():
-    """Classification features."""
+def _classification_features(tmva_style=False):
+    """
+    Classification features.
+
+    Parameters
+    ----------
+    tmva_style : bool, optional
+        If True, return features matching TMVA BDT input (default: False).
+
+    Returns
+    -------
+    list
+        List of feature names.
+    """
+    if tmva_style:
+        return [
+            "DispNImages",
+            "EChi2S",
+            "EmissionHeight",
+            "EmissionHeightChi2",
+            "MSCW",
+            "MSCL",
+            "SizeSecondMax",
+            "ArrayPointing_Elevation",
+            "ArrayPointing_Azimuth",
+        ]
+
     var_tel = telescope_features("classification")
     var_array = [
         "DispNImages",

From 754ed46f4c237e8560afe24c516e9048f5c9acf8 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 10:11:34 +0200
Subject: [PATCH 02/14] add tmva style

---
 src/eventdisplay_ml/data_processing.py | 34 +++++++++++++++++---------
 src/eventdisplay_ml/features.py        | 25 +++++++++++++++++++
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index a845fb7..a66886c 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -797,7 +797,12 @@ def load_training_data(model_configs, file_list, analysis_type):
 
     input_files = utils.read_input_file_list(file_list)
 
-    branch_list = features_module.features(analysis_type, training=True)
+    tmva_style = model_configs.get("tmva_style", False)
+    if tmva_style and analysis_type == "classification":
+        _logger.info("Using TMVA-style features for classification")
+        branch_list = features_module.features_tmva_style(analysis_type, training=True)
+    else:
+        branch_list = features_module.features(analysis_type, training=True)
     _logger.info(f"Branch list: {branch_list}")
     if max_events is not None and max_events > 0:
         max_events_per_file = max_events // len(input_files)
@@ -860,17 +865,22 @@ def load_training_data(model_configs, file_list, analysis_type):
                     f" (fraction retained: {len(df) / n_before:.4f})"
                 )
 
-                df_flat = flatten_telescope_data_vectorized(
-                    df,
-                    tel_config["max_tel_id"] + 1,
-                    features_module.telescope_features(analysis_type),
-                    analysis_type,
-                    training=True,
-                    tel_config=tel_config,
-                    observatory=model_configs.get("observatory", "veritas"),
-                    max_tel_per_type=model_configs.get("max_tel_per_type", None),
-                    preview_rows=model_configs.get("preview_rows", 20),
-                )
+                # For TMVA-style classification, skip telescope flattening (use event-level features only)
+                if tmva_style and analysis_type == "classification":
+                    _logger.info("Converting to pandas (no telescope flattening for TMVA style)")
+                    df_flat = ak.to_pandas(df)
+                else:
+                    df_flat = flatten_telescope_data_vectorized(
+                        df,
+                        tel_config["max_tel_id"] + 1,
+                        features_module.telescope_features(analysis_type),
+                        analysis_type,
+                        training=True,
+                        tel_config=tel_config,
+                        observatory=model_configs.get("observatory", "veritas"),
+                        max_tel_per_type=model_configs.get("max_tel_per_type", None),
+                        preview_rows=model_configs.get("preview_rows", 20),
+                    )
 
                 # Filter out events with invalid energy reconstruction for stereo training
                 if analysis_type == "stereo_analysis":
diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
index 9a069d6..acfbf12 100644
--- a/src/eventdisplay_ml/features.py
+++ b/src/eventdisplay_ml/features.py
@@ -1,6 +1,31 @@
 """Features used for XGB training and prediction."""
 
 
+def features_tmva_style(analysis_type, training=True):
+    """
+    Get TMVA-style features for classification analysis.
+
+    Parameters
+    ----------
+    analysis_type : str
+        Type of analysis.
+    training : bool, optional
+        If True (default), return features including target features.
+        If False, return only non-target features (i.e. features used
+        for prediction).
+
+    Returns
+    -------
+    list
+        List of feature names.
+    """
+    if analysis_type == "stereo_analysis":
+        return _regression_features(training)
+    if "classification" in analysis_type:
+        return _classification_features(tmva_style=True)
+    raise ValueError(f"Unknown analysis type: {analysis_type}")
+
+
 def target_features(analysis_type):
     """
     Get target features based on analysis type.

From da2b99eebc0e59c0cf2b1f05ff79ed75577e663d Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 10:38:15 +0200
Subject: [PATCH 03/14] optional fields

---
 src/eventdisplay_ml/data_processing.py        | 13 +++++++++--
 ...test_classification_apply_interpolation.py | 22 +++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index a66886c..33bb01b 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -130,7 +130,15 @@ def _resolve_branch_aliases(tree, branch_list):
     resolved = [b for b in resolved if b not in synthesized]
 
     # Drop missing optional branches
-    optional = {"fpointing_dx", "fpointing_dy", "E", "Erec", "ErecS", "nlowgain"}
+    optional = {
+        "fpointing_dx",
+        "fpointing_dy",
+        "E",
+        "Erec",
+        "ErecS",
+        "nlowgain",
+        "SizeSecondMax",
+    }
     final = [b for b in resolved if b not in optional or b in keys]
 
     return final, rename
@@ -1205,8 +1213,9 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             "EChi2S": _to_numpy_1d(df["EChi2S"], np.float32),
             "EmissionHeight": _to_numpy_1d(df["EmissionHeight"], np.float32),
             "EmissionHeightChi2": _to_numpy_1d(df["EmissionHeightChi2"], np.float32),
-            "SizeSecondMax": _to_numpy_1d(df["SizeSecondMax"], np.float32),
         }
+        if _has_field(df, "SizeSecondMax"):
+            data["SizeSecondMax"] = _to_numpy_1d(df["SizeSecondMax"], np.float32)
         if not training:
             data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32)
 
diff --git a/tests/test_classification_apply_interpolation.py b/tests/test_classification_apply_interpolation.py
index a0d0e67..198d71e 100644
--- a/tests/test_classification_apply_interpolation.py
+++ b/tests/test_classification_apply_interpolation.py
@@ -70,3 +70,25 @@ def test_apply_classification_models_interpolates_probabilities_and_thresholds(m
 
     # Thresholds are interpolated the same way: [0.5, 0.7]
     np.testing.assert_array_equal(is_gamma[50], np.array([0, 1], dtype=np.uint8))
+
+
+def test_extra_columns_skip_tmva_only_size_second_max_when_branch_missing():
+    """Standard classification should not synthesize the TMVA-only SizeSecondMax column."""
+    df = pd.DataFrame(
+        {
+            "MSCW": [0.1, 0.2],
+            "MSCL": [0.3, 0.4],
+            "EChi2S": [1.0, 2.0],
+            "EmissionHeight": [10.0, 20.0],
+            "EmissionHeightChi2": [0.5, 0.25],
+        }
+    )
+
+    result = data_processing.extra_columns(
+        df,
+        analysis_type="classification",
+        training=True,
+        index=pd.RangeIndex(2),
+    )
+
+    assert "SizeSecondMax" not in result.columns

From 7aae25ea268ced849884fb01409d0ad715ba17c4 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 13:22:55 +0200
Subject: [PATCH 04/14] read tmva style

---
 src/eventdisplay_ml/config.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/eventdisplay_ml/config.py b/src/eventdisplay_ml/config.py
index 860bbca..07b47db 100644
--- a/src/eventdisplay_ml/config.py
+++ b/src/eventdisplay_ml/config.py
@@ -149,12 +149,23 @@ def configure_training(analysis_type):
         model_parameters = utils.load_model_parameters(
             model_configs["model_parameters"], model_configs["energy_bin_number"]
         )
+        tmva_style_raw = model_parameters.get("tmva_style", False)
+        if isinstance(tmva_style_raw, str):
+            model_configs["tmva_style"] = tmva_style_raw.strip().lower() in {
+                "1",
+                "true",
+                "yes",
+                "on",
+            }
+        else:
+            model_configs["tmva_style"] = bool(tmva_style_raw)
         model_configs["pre_cuts"] = pre_cuts_classification(
             e_min=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_min")),
             e_max=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_max")),
         )
         model_configs["energy_bins_log10_tev"] = model_parameters.get("energy_bins_log10_tev", [])
         model_configs["zenith_bins_deg"] = model_parameters.get("zenith_bins_deg", [])
+        _logger.info(f"TMVA-style classification: {model_configs['tmva_style']}")
 
     _logger.info(f"Pre-cuts: {model_configs['pre_cuts']}")
 

From e4248a4a22a84654c0728fda4d091b77082e9f99 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 13:26:24 +0200
Subject: [PATCH 05/14] read tmva style

---
 src/eventdisplay_ml/data_processing.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index 33bb01b..4b32a11 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -876,7 +876,9 @@ def load_training_data(model_configs, file_list, analysis_type):
                 # For TMVA-style classification, skip telescope flattening (use event-level features only)
                 if tmva_style and analysis_type == "classification":
                     _logger.info("Converting to pandas (no telescope flattening for TMVA style)")
-                    df_flat = ak.to_pandas(df)
+                    # Build DataFrame directly to stay compatible across awkward versions
+                    # (some versions do not provide ak.to_pandas).
+                    df_flat = pd.DataFrame({name: _to_numpy_1d(df[name]) for name in df.fields})
                 else:
                     df_flat = flatten_telescope_data_vectorized(
                         df,

From 1c05bb6be0f6435deff0a087fd784bb7606475ca Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Mon, 11 May 2026 14:23:54 +0200
Subject: [PATCH 06/14] add signal events

---
 src/eventdisplay_ml/evaluate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py
index 54167d8..8b0d4e5 100644
--- a/src/eventdisplay_ml/evaluate.py
+++ b/src/eventdisplay_ml/evaluate.py
@@ -41,8 +41,8 @@ def evaluation_efficiency(name, model, x_test, y_test):
             "threshold": thresholds,
             "signal_efficiency": eff_signal,
             "background_efficiency": eff_background,
-            "n_signal": n_signal,
-            "n_background": n_background,
+            "n_signal": n_signal * eff_signal,
+            "n_background": n_background * eff_background,
         }
     )
 

From f547ac07780663cf5f1a88c872c7f2d6cafb8307 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Tue, 12 May 2026 09:32:36 +0200
Subject: [PATCH 07/14] correct signal efficiency

---
 src/eventdisplay_ml/evaluate.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py
index 8b0d4e5..fe93b17 100644
--- a/src/eventdisplay_ml/evaluate.py
+++ b/src/eventdisplay_ml/evaluate.py
@@ -36,6 +36,9 @@ def evaluation_efficiency(name, model, x_test, y_test):
             f"Background Efficiency: {eff_background[-1]:.4f}"
         )
 
+    eff_signal = np.asarray(eff_signal, dtype=float)
+    eff_background = np.asarray(eff_background, dtype=float)
+
     return pd.DataFrame(
         {
             "threshold": thresholds,

From 6db8b3ee65296677172cc7c3791a9056e4ee1ecb Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Tue, 12 May 2026 17:08:02 +0200
Subject: [PATCH 08/14] set pointing variables correctly

---
 src/eventdisplay_ml/data_processing.py | 13 +++++++++++++
 src/eventdisplay_ml/features.py        |  3 +--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index 4b32a11..5f2524f 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -809,6 +809,11 @@ def load_training_data(model_configs, file_list, analysis_type):
     if tmva_style and analysis_type == "classification":
         _logger.info("Using TMVA-style features for classification")
         branch_list = features_module.features_tmva_style(analysis_type, training=True)
+        # ze_bin is a derived feature and not present in ROOT input.
+        # Read elevation as auxiliary input to derive ze_bin.
+        branch_list = [b for b in branch_list if b not in {"ze_bin", "ArrayPointing_Azimuth"}] + [
+            "ArrayPointing_Elevation"
+        ]
     else:
         branch_list = features_module.features(analysis_type, training=True)
     _logger.info(f"Branch list: {branch_list}")
@@ -932,6 +937,14 @@ def load_training_data(model_configs, file_list, analysis_type):
                 for col_name, values in new_cols.items():
                     df_flat[col_name] = values
 
+                # For TMVA-style classification, keep pointing only as an intermediate
+                # to compute ze_bin, but do not expose raw pointing as ML features.
+                if tmva_style and analysis_type == "classification":
+                    df_flat = df_flat.drop(
+                        columns=["ArrayPointing_Elevation", "ArrayPointing_Azimuth"],
+                        errors="ignore",
+                    )
+
                 # Filter out events with NaN in residuals (can't train on these)
                 if analysis_type == "stereo_analysis":
                     n_before_nan_filter = len(df_flat)
diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
index acfbf12..0e576b4 100644
--- a/src/eventdisplay_ml/features.py
+++ b/src/eventdisplay_ml/features.py
@@ -184,8 +184,7 @@ def _classification_features(tmva_style=False):
             "MSCW",
             "MSCL",
             "SizeSecondMax",
-            "ArrayPointing_Elevation",
-            "ArrayPointing_Azimuth",
+            "ze_bin",
         ]
 
     var_tel = telescope_features("classification")

From f1a05a4f063b8bcb67d29a280adb90f2a9aa2ee6 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 10:20:08 +0200
Subject: [PATCH 09/14] features tmva style

---
 src/eventdisplay_ml/features.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
index 0e576b4..d4b277d 100644
--- a/src/eventdisplay_ml/features.py
+++ b/src/eventdisplay_ml/features.py
@@ -20,7 +20,7 @@ def features_tmva_style(analysis_type, training=True):
         List of feature names.
     """
     if analysis_type == "stereo_analysis":
-        return _regression_features(training)
+        raise ValueError("TMVA-style features are only defined for classification analysis.")
     if "classification" in analysis_type:
         return _classification_features(tmva_style=True)
     raise ValueError(f"Unknown analysis type: {analysis_type}")
@@ -92,7 +92,7 @@ def telescope_features(analysis_type):
     Parameters
     ----------
     analysis_type : str
-        Type of analysis, e.g. ``"classification"`` or ``"stereo_analysis"``.
+        Type of analysis.
 
     Returns
     -------
@@ -176,6 +176,7 @@ def _classification_features(tmva_style=False):
         List of feature names.
     """
     if tmva_style:
+        # Base features from ROOT file needed to compute derived features
         return [
             "DispNImages",
             "EChi2S",
@@ -184,7 +185,10 @@ def _classification_features(tmva_style=False):
             "MSCW",
             "MSCL",
             "SizeSecondMax",
-            "ze_bin",
+            "Xcore",
+            "Ycore",
+            "DispAbsSumWeigth",
+            "ArrayPointing_Elevation",
         ]
 
     var_tel = telescope_features("classification")
@@ -211,6 +215,9 @@ def clip_intervals():
     """
     Get clip intervals for variables.
 
+    Clip intervals are defined based on physical bounds or ranges used
+    in the classical TMVA BDT analysis.
+
     Returns
     -------
     dict
@@ -232,8 +239,8 @@ def clip_intervals():
         # Energy-related variables - log10 transformation with lower bound
         "Erec": (energy_min, None),
         "ErecS": (energy_min, None),
-        "EChi2S": (energy_min, None),
-        "EmissionHeightChi2": (1e-6, None),
+        "EChi2S": (energy_min, 10000.0),  # TMVA: -6 to 4 (before log10)
+        "EmissionHeightChi2": (1e-6, 10000.0),  # TMVA: -11 to 4 (before log10)
         "img2_ang": (0.0, 360.0),
         # Per-telescope energy and size variables - log10 transformation with lower bound
         "size": (1, None),
@@ -244,8 +251,13 @@ def clip_intervals():
         # Derived variables - avoid numerical issues
         "size_dist2": (1.0, None),
         "tgrad_x": (-50.0, 50.0),
+        "MSCW": (-2.0, 2.0),
+        "MSCL": (-2.0, 5.0),
+        "SizeSecondMax": (1e-6, 100000.0),  # TMVA: 0 to 5 (after log10)
+        "Core_Distance": (0.0, 1000.0),  # sqrt(Xcore^2 + Ycore^2)
+        "DispAbsSumWeigth": (0.0, 5.0),  # TMVA: 0 to 5
         # Physical bounds
-        "EmissionHeight": (0, 120),  # top of atmosphere
+        "EmissionHeight": (0, 100),  # top of atmosphere
         "R_core": (-10, None),  # badly reconstructed events
     }
 

From 7acd328888ab97b1f2c36910817ea7ccf8f3dd2b Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 10:24:41 +0200
Subject: [PATCH 10/14] correct tmva variables

---
 src/eventdisplay_ml/data_processing.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index 5f2524f..5e4cb21 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -51,7 +51,6 @@ def read_telescope_config(root_file):
 
     n_tel = int(telconfig_data["NTel"][0])
     tel_ids = telconfig_data["TelID"]
-    # Keep array of mirror areas; avoid shadowing this name later
     mirror_area_arr = telconfig_data["MirrorArea"]
     tel_x = telconfig_data["TelX"]
     tel_y = telconfig_data["TelY"]
@@ -1222,6 +1221,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             tel_list_matrix = _to_dense_array(df["DispTelList_T"])
             data["array_footprint"] = _calculate_array_footprint(tel_config, tel_list_matrix)
     elif "classification" in analysis_type:
+        n = len(index)
         data = {
             "MSCW": _to_numpy_1d(df["MSCW"], np.float32),
             "MSCL": _to_numpy_1d(df["MSCL"], np.float32),
@@ -1231,6 +1231,17 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
         }
         if _has_field(df, "SizeSecondMax"):
             data["SizeSecondMax"] = _to_numpy_1d(df["SizeSecondMax"], np.float32)
+        # Add core distance: sqrt(Xcore^2 + Ycore^2)
+        if _has_field(df, "Xcore") and _has_field(df, "Ycore"):
+            xcore = _to_numpy_1d(df["Xcore"], np.float32)
+            ycore = _to_numpy_1d(df["Ycore"], np.float32)
+            data["Core_Distance"] = np.sqrt(xcore**2 + ycore**2).astype(np.float32)
+        else:
+            data["Core_Distance"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
+        if _has_field(df, "DispAbsSumWeigth"):
+            data["DispAbsSumWeigth"] = _to_numpy_1d(df["DispAbsSumWeigth"], np.float32)
+        else:
+            data["DispAbsSumWeigth"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
         if not training:
             data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32)
 
@@ -1249,6 +1260,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             "EmissionHeightChi2",
             "Erec",
             "ErecS",
+            "SizeSecondMax",
         ]
     apply_clip_intervals(
         df_extra,
@@ -1283,6 +1295,8 @@ def energy_in_bins(df_chunk, bins):
 def energy_interpolation_bins(df_chunk, bins):
     """Compute neighboring energy bins and interpolation weights per event.
 
+    Allows to interpolate downstream using 'value = (1 - alpha) * value_lo + alpha * value_hi'.
+
     Parameters
     ----------
     df_chunk : pandas.DataFrame

From 92e06ae057d5805dd32e125dba906bbff5bd854c Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 10:34:05 +0200
Subject: [PATCH 11/14] unit tests

---
 tests/test_data_processing.py | 48 +++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 tests/test_data_processing.py

diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py
new file mode 100644
index 0000000..e565113
--- /dev/null
+++ b/tests/test_data_processing.py
@@ -0,0 +1,48 @@
+"""Tests for data_processing."""
+
+import numpy as np
+import pandas as pd
+
+from eventdisplay_ml.data_processing import energy_interpolation_bins, zenith_in_bins
+
+
+def test_zenith_in_bins_numeric_edges_clips_and_handles_boundaries():
+    """Numeric bin edges should clip out-of-range values and place edge values consistently."""
+    zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
+    bins = [0.0, 10.0, 20.0, 30.0]
+
+    result = zenith_in_bins(zenith_angles, bins)
+
+    np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
+    assert result.dtype == np.int32
+
+
+def test_zenith_in_bins_dict_bins_matches_numeric_definition():
+    """Dict-style zenith bins should map to the same indices as explicit numeric edges."""
+    zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
+    dict_bins = [
+        {"Ze_min": 0.0, "Ze_max": 10.0},
+        {"Ze_min": 10.0, "Ze_max": 20.0},
+        {"Ze_min": 20.0, "Ze_max": 30.0},
+    ]
+
+    result = zenith_in_bins(zenith_angles, dict_bins)
+
+    np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
+    assert result.dtype == np.int32
+
+
+def test_energy_interpolation_bins_interpolates_and_clamps_with_invalid_events():
+    """Interpolation bins should handle interior, edge, and invalid energies robustly."""
+    df_chunk = pd.DataFrame({"Erec": [0.0, 0.1, 1.0, 10.0, 100.0]})
+    bins = [
+        {"E_min": -1.5, "E_max": -0.5},
+        None,
+        {"E_min": 0.5, "E_max": 1.5},
+    ]
+
+    e_bin_lo, e_bin_hi, e_alpha = energy_interpolation_bins(df_chunk, bins)
+
+    np.testing.assert_array_equal(e_bin_lo, np.array([-1, 0, 0, 0, 2], dtype=np.int32))
+    np.testing.assert_array_equal(e_bin_hi, np.array([-1, 0, 2, 2, 2], dtype=np.int32))
+    np.testing.assert_allclose(e_alpha, np.array([0.0, 0.0, 0.5, 1.0, 0.0], dtype=np.float32))

From 2eb6a86aef3c8cc7de6f79a5d5c2e39f2ad52a21 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 10:58:42 +0200
Subject: [PATCH 12/14] changelog

---
 docs/changes/58.feature.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 docs/changes/58.feature.md

diff --git a/docs/changes/58.feature.md b/docs/changes/58.feature.md
new file mode 100644
index 0000000..f345cfe
--- /dev/null
+++ b/docs/changes/58.feature.md
@@ -0,0 +1 @@
+Add TMVA-style gamma/hadron separation with identical features as TMVA BDT classification analysis.

From 0fe6e03108bdc4a1e4236ce3c0c7371c9c13334f Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 11:14:01 +0200
Subject: [PATCH 13/14] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 docs/changes/58.feature.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changes/58.feature.md b/docs/changes/58.feature.md
index f345cfe..c7f0210 100644
--- a/docs/changes/58.feature.md
+++ b/docs/changes/58.feature.md
@@ -1 +1 @@
-Add TMVA-style gamma/hadron separation with identical features as TMVA BDT classification analysis.
+Add TMVA-style gamma/hadron separation with the same features as TMVA BDT classification analysis.

From 3f5d12e3ebb3718ad6ccd68727662d98543623e8 Mon Sep 17 00:00:00 2001
From: Gernot Maier <gernot.maier@desy.de>
Date: Thu, 14 May 2026 11:14:12 +0200
Subject: [PATCH 14/14] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/eventdisplay_ml/data_processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index 5e4cb21..a3801d2 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -1295,7 +1295,7 @@ def energy_in_bins(df_chunk, bins):
 def energy_interpolation_bins(df_chunk, bins):
     """Compute neighboring energy bins and interpolation weights per event.
 
-    Allows to interpolate downstream using 'value = (1 - alpha) * value_lo + alpha * value_hi'.
+    Allows downstream interpolation using 'value = (1 - alpha) * value_lo + alpha * value_hi'.
 
     Parameters
     ----------