Eventdisplay · GernotMaier · May 14, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/docs/changes/58.feature.md b/docs/changes/58.feature.md
@@ -0,0 +1 @@
+Add TMVA-style gamma/hadron separation with the same features as TMVA BDT classification analysis.
diff --git a/src/eventdisplay_ml/config.py b/src/eventdisplay_ml/config.py
@@ -149,12 +149,23 @@ def configure_training(analysis_type):
         model_parameters = utils.load_model_parameters(
             model_configs["model_parameters"], model_configs["energy_bin_number"]
         )
+        tmva_style_raw = model_parameters.get("tmva_style", False)
+        if isinstance(tmva_style_raw, str):
+            model_configs["tmva_style"] = tmva_style_raw.strip().lower() in {
+                "1",
+                "true",
+                "yes",
+                "on",
+            }
+        else:
+            model_configs["tmva_style"] = bool(tmva_style_raw)
         model_configs["pre_cuts"] = pre_cuts_classification(
             e_min=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_min")),
             e_max=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_max")),
         )
         model_configs["energy_bins_log10_tev"] = model_parameters.get("energy_bins_log10_tev", [])
         model_configs["zenith_bins_deg"] = model_parameters.get("zenith_bins_deg", [])
+        _logger.info(f"TMVA-style classification: {model_configs['tmva_style']}")
 
     _logger.info(f"Pre-cuts: {model_configs['pre_cuts']}")
 

diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
@@ -51,7 +51,6 @@ def read_telescope_config(root_file):
 
     n_tel = int(telconfig_data["NTel"][0])
     tel_ids = telconfig_data["TelID"]
-    # Keep array of mirror areas; avoid shadowing this name later
     mirror_area_arr = telconfig_data["MirrorArea"]
     tel_x = telconfig_data["TelX"]
     tel_y = telconfig_data["TelY"]
@@ -130,7 +129,15 @@ def _resolve_branch_aliases(tree, branch_list):
     resolved = [b for b in resolved if b not in synthesized]
 
     # Drop missing optional branches
-    optional = {"fpointing_dx", "fpointing_dy", "E", "Erec", "ErecS", "nlowgain"}
+    optional = {
+        "fpointing_dx",
+        "fpointing_dy",
+        "E",
+        "Erec",
+        "ErecS",
+        "nlowgain",
+        "SizeSecondMax",
+    }
     final = [b for b in resolved if b not in optional or b in keys]
 
     return final, rename
@@ -797,7 +804,17 @@ def load_training_data(model_configs, file_list, analysis_type):
 
     input_files = utils.read_input_file_list(file_list)
 
-    branch_list = features_module.features(analysis_type, training=True)
+    tmva_style = model_configs.get("tmva_style", False)
+    if tmva_style and analysis_type == "classification":
+        _logger.info("Using TMVA-style features for classification")
+        branch_list = features_module.features_tmva_style(analysis_type, training=True)
+        # ze_bin is a derived feature and not present in ROOT input.
+        # Read elevation as auxiliary input to derive ze_bin.
+        branch_list = [b for b in branch_list if b not in {"ze_bin", "ArrayPointing_Azimuth"}] + [
+            "ArrayPointing_Elevation"
+        ]
+    else:
+        branch_list = features_module.features(analysis_type, training=True)
     _logger.info(f"Branch list: {branch_list}")
     if max_events is not None and max_events > 0:
         max_events_per_file = max_events // len(input_files)
@@ -860,17 +877,24 @@ def load_training_data(model_configs, file_list, analysis_type):
                     f" (fraction retained: {len(df) / n_before:.4f})"
                 )
 
-                df_flat = flatten_telescope_data_vectorized(
-                    df,
-                    tel_config["max_tel_id"] + 1,
-                    features_module.telescope_features(analysis_type),
-                    analysis_type,
-                    training=True,
-                    tel_config=tel_config,
-                    observatory=model_configs.get("observatory", "veritas"),
-                    max_tel_per_type=model_configs.get("max_tel_per_type", None),
-                    preview_rows=model_configs.get("preview_rows", 20),
-                )
+                # For TMVA-style classification, skip telescope flattening (use event-level features only)
+                if tmva_style and analysis_type == "classification":
+                    _logger.info("Converting to pandas (no telescope flattening for TMVA style)")
+                    # Build DataFrame directly to stay compatible across awkward versions
+                    # (some versions do not provide ak.to_pandas).
+                    df_flat = pd.DataFrame({name: _to_numpy_1d(df[name]) for name in df.fields})
+                else:
+                    df_flat = flatten_telescope_data_vectorized(
+                        df,
+                        tel_config["max_tel_id"] + 1,
+                        features_module.telescope_features(analysis_type),
+                        analysis_type,
+                        training=True,
+                        tel_config=tel_config,
+                        observatory=model_configs.get("observatory", "veritas"),
+                        max_tel_per_type=model_configs.get("max_tel_per_type", None),
+                        preview_rows=model_configs.get("preview_rows", 20),
+                    )
 
                 # Filter out events with invalid energy reconstruction for stereo training
                 if analysis_type == "stereo_analysis":
@@ -912,6 +936,14 @@ def load_training_data(model_configs, file_list, analysis_type):
                 for col_name, values in new_cols.items():
                     df_flat[col_name] = values
 
+                # For TMVA-style classification, keep pointing only as an intermediate
+                # to compute ze_bin, but do not expose raw pointing as ML features.
+                if tmva_style and analysis_type == "classification":
+                    df_flat = df_flat.drop(
+                        columns=["ArrayPointing_Elevation", "ArrayPointing_Azimuth"],
+                        errors="ignore",
+                    )
+
                 # Filter out events with NaN in residuals (can't train on these)
                 if analysis_type == "stereo_analysis":
                     n_before_nan_filter = len(df_flat)
@@ -1189,13 +1221,27 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             tel_list_matrix = _to_dense_array(df["DispTelList_T"])
             data["array_footprint"] = _calculate_array_footprint(tel_config, tel_list_matrix)
     elif "classification" in analysis_type:
+        n = len(index)
         data = {
             "MSCW": _to_numpy_1d(df["MSCW"], np.float32),
             "MSCL": _to_numpy_1d(df["MSCL"], np.float32),
             "EChi2S": _to_numpy_1d(df["EChi2S"], np.float32),
             "EmissionHeight": _to_numpy_1d(df["EmissionHeight"], np.float32),
             "EmissionHeightChi2": _to_numpy_1d(df["EmissionHeightChi2"], np.float32),
         }
+        if _has_field(df, "SizeSecondMax"):
+            data["SizeSecondMax"] = _to_numpy_1d(df["SizeSecondMax"], np.float32)
+        # Add core distance: sqrt(Xcore^2 + Ycore^2)
+        if _has_field(df, "Xcore") and _has_field(df, "Ycore"):
+            xcore = _to_numpy_1d(df["Xcore"], np.float32)
+            ycore = _to_numpy_1d(df["Ycore"], np.float32)
+            data["Core_Distance"] = np.sqrt(xcore**2 + ycore**2).astype(np.float32)
+        else:
+            data["Core_Distance"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
+        if _has_field(df, "DispAbsSumWeigth"):
+            data["DispAbsSumWeigth"] = _to_numpy_1d(df["DispAbsSumWeigth"], np.float32)
+        else:
+            data["DispAbsSumWeigth"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
         if not training:
             data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32)
 
@@ -1214,6 +1260,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
             "EmissionHeightChi2",
             "Erec",
             "ErecS",
+            "SizeSecondMax",
         ]
     apply_clip_intervals(
         df_extra,
@@ -1248,6 +1295,8 @@ def energy_in_bins(df_chunk, bins):
 def energy_interpolation_bins(df_chunk, bins):
     """Compute neighboring energy bins and interpolation weights per event.
 
+    Allows downstream interpolation using 'value = (1 - alpha) * value_lo + alpha * value_hi'.
+
     Parameters
     ----------
     df_chunk : pandas.DataFrame

diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py
@@ -36,13 +36,16 @@ def evaluation_efficiency(name, model, x_test, y_test):
             f"Background Efficiency: {eff_background[-1]:.4f}"
         )
 
+    eff_signal = np.asarray(eff_signal, dtype=float)
+    eff_background = np.asarray(eff_background, dtype=float)
+
     return pd.DataFrame(
         {
             "threshold": thresholds,
             "signal_efficiency": eff_signal,
             "background_efficiency": eff_background,
-            "n_signal": n_signal,
-            "n_background": n_background,
+            "n_signal": n_signal * eff_signal,
+            "n_background": n_background * eff_background,
         }
     )
 

diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
@@ -1,6 +1,31 @@
 """Features used for XGB training and prediction."""
 
 
+def features_tmva_style(analysis_type, training=True):
+    """
+    Get TMVA-style features for classification analysis.
+
+    Parameters
+    ----------
+    analysis_type : str
+        Type of analysis.
+    training : bool, optional
+        If True (default), return features including target features.
+        If False, return only non-target features (i.e. features used
+        for prediction).
+
+    Returns
+    -------
+    list
+        List of feature names.
+    """
+    if analysis_type == "stereo_analysis":
+        raise ValueError("TMVA-style features are only defined for classification analysis.")
+    if "classification" in analysis_type:
+        return _classification_features(tmva_style=True)
+    raise ValueError(f"Unknown analysis type: {analysis_type}")
+
+
 def target_features(analysis_type):
     """
     Get target features based on analysis type.
@@ -67,7 +92,7 @@ def telescope_features(analysis_type):
     Parameters
     ----------
     analysis_type : str
-        Type of analysis, e.g. ``"classification"`` or ``"stereo_analysis"``.
+        Type of analysis.
 
     Returns
     -------
@@ -136,8 +161,36 @@ def _regression_features(training):
     return var
 
 
-def _classification_features():
-    """Classification features."""
+def _classification_features(tmva_style=False):
+    """
+    Classification features.
+
+    Parameters
+    ----------
+    tmva_style : bool, optional
+        If True, return features matching TMVA BDT input (default: False).
+
+    Returns
+    -------
+    list
+        List of feature names.
+    """
+    if tmva_style:
+        # Base features from ROOT file needed to compute derived features
+        return [
+            "DispNImages",
+            "EChi2S",
+            "EmissionHeight",
+            "EmissionHeightChi2",
+            "MSCW",
+            "MSCL",
+            "SizeSecondMax",
+            "Xcore",
+            "Ycore",
+            "DispAbsSumWeigth",
+            "ArrayPointing_Elevation",
+        ]
+
     var_tel = telescope_features("classification")
     var_array = [
         "DispNImages",
@@ -162,6 +215,9 @@ def clip_intervals():
     """
     Get clip intervals for variables.
 
+    Clip intervals are defined based on physical bounds or ranges used
+    in the classical TMVA BDT analysis.
+
     Returns
     -------
     dict
@@ -183,8 +239,8 @@ def clip_intervals():
         # Energy-related variables - log10 transformation with lower bound
         "Erec": (energy_min, None),
         "ErecS": (energy_min, None),
-        "EChi2S": (energy_min, None),
-        "EmissionHeightChi2": (1e-6, None),
+        "EChi2S": (energy_min, 10000.0),  # TMVA: -6 to 4 (before log10)
+        "EmissionHeightChi2": (1e-6, 10000.0),  # TMVA: -11 to 4 (before log10)
         "img2_ang": (0.0, 360.0),
         # Per-telescope energy and size variables - log10 transformation with lower bound
         "size": (1, None),
@@ -195,8 +251,13 @@ def clip_intervals():
         # Derived variables - avoid numerical issues
         "size_dist2": (1.0, None),
         "tgrad_x": (-50.0, 50.0),
+        "MSCW": (-2.0, 2.0),
+        "MSCL": (-2.0, 5.0),
+        "SizeSecondMax": (1e-6, 100000.0),  # TMVA: 0 to 5 (after log10)
+        "Core_Distance": (0.0, 1000.0),  # sqrt(Xcore^2 + Ycore^2)
+        "DispAbsSumWeigth": (0.0, 5.0),  # TMVA: 0 to 5
         # Physical bounds
-        "EmissionHeight": (0, 120),  # top of atmosphere
+        "EmissionHeight": (0, 100),  # top of atmosphere
         "R_core": (-10, None),  # badly reconstructed events
     }
 

diff --git a/tests/test_classification_apply_interpolation.py b/tests/test_classification_apply_interpolation.py
@@ -70,3 +70,25 @@ def test_apply_classification_models_interpolates_probabilities_and_thresholds(m
 
     # Thresholds are interpolated the same way: [0.5, 0.7]
     np.testing.assert_array_equal(is_gamma[50], np.array([0, 1], dtype=np.uint8))
+
+
+def test_extra_columns_skip_tmva_only_size_second_max_when_branch_missing():
+    """Standard classification should not synthesize the TMVA-only SizeSecondMax column."""
+    df = pd.DataFrame(
+        {
+            "MSCW": [0.1, 0.2],
+            "MSCL": [0.3, 0.4],
+            "EChi2S": [1.0, 2.0],
+            "EmissionHeight": [10.0, 20.0],
+            "EmissionHeightChi2": [0.5, 0.25],
+        }
+    )
+
+    result = data_processing.extra_columns(
+        df,
+        analysis_type="classification",
+        training=True,
+        index=pd.RangeIndex(2),
+    )
+
+    assert "SizeSecondMax" not in result.columns
diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py
@@ -0,0 +1,48 @@
+"""Tests for data_processing."""
+
+import numpy as np
+import pandas as pd
+
+from eventdisplay_ml.data_processing import energy_interpolation_bins, zenith_in_bins
+
+
+def test_zenith_in_bins_numeric_edges_clips_and_handles_boundaries():
+    """Numeric bin edges should clip out-of-range values and place edge values consistently."""
+    zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
+    bins = [0.0, 10.0, 20.0, 30.0]
+
+    result = zenith_in_bins(zenith_angles, bins)
+
+    np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
+    assert result.dtype == np.int32
+
+
+def test_zenith_in_bins_dict_bins_matches_numeric_definition():
+    """Dict-style zenith bins should map to the same indices as explicit numeric edges."""
+    zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
+    dict_bins = [
+        {"Ze_min": 0.0, "Ze_max": 10.0},
+        {"Ze_min": 10.0, "Ze_max": 20.0},
+        {"Ze_min": 20.0, "Ze_max": 30.0},
+    ]
+
+    result = zenith_in_bins(zenith_angles, dict_bins)
+
+    np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
+    assert result.dtype == np.int32
+
+
+def test_energy_interpolation_bins_interpolates_and_clamps_with_invalid_events():
+    """Interpolation bins should handle interior, edge, and invalid energies robustly."""
+    df_chunk = pd.DataFrame({"Erec": [0.0, 0.1, 1.0, 10.0, 100.0]})
+    bins = [
+        {"E_min": -1.5, "E_max": -0.5},
+        None,
+        {"E_min": 0.5, "E_max": 1.5},
+    ]
+
+    e_bin_lo, e_bin_hi, e_alpha = energy_interpolation_bins(df_chunk, bins)
+
+    np.testing.assert_array_equal(e_bin_lo, np.array([-1, 0, 0, 0, 2], dtype=np.int32))
+    np.testing.assert_array_equal(e_bin_hi, np.array([-1, 0, 2, 2, 2], dtype=np.int32))
+    np.testing.assert_allclose(e_alpha, np.array([0.0, 0.0, 0.5, 1.0, 0.0], dtype=np.float32))
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Add TMVA-style gamma/hadron separation with the same features as TMVA BDT classification analysis.