Skip to content
1 change: 1 addition & 0 deletions docs/changes/58.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add TMVA-style gamma/hadron separation with the same features as TMVA BDT classification analysis.
11 changes: 11 additions & 0 deletions src/eventdisplay_ml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,12 +149,23 @@ def configure_training(analysis_type):
model_parameters = utils.load_model_parameters(
model_configs["model_parameters"], model_configs["energy_bin_number"]
)
tmva_style_raw = model_parameters.get("tmva_style", False)
if isinstance(tmva_style_raw, str):
model_configs["tmva_style"] = tmva_style_raw.strip().lower() in {
"1",
"true",
"yes",
"on",
}
else:
model_configs["tmva_style"] = bool(tmva_style_raw)
model_configs["pre_cuts"] = pre_cuts_classification(
e_min=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_min")),
e_max=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_max")),
)
model_configs["energy_bins_log10_tev"] = model_parameters.get("energy_bins_log10_tev", [])
model_configs["zenith_bins_deg"] = model_parameters.get("zenith_bins_deg", [])
_logger.info(f"TMVA-style classification: {model_configs['tmva_style']}")

_logger.info(f"Pre-cuts: {model_configs['pre_cuts']}")

Expand Down
77 changes: 63 additions & 14 deletions src/eventdisplay_ml/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def read_telescope_config(root_file):

n_tel = int(telconfig_data["NTel"][0])
tel_ids = telconfig_data["TelID"]
# Keep array of mirror areas; avoid shadowing this name later
mirror_area_arr = telconfig_data["MirrorArea"]
tel_x = telconfig_data["TelX"]
tel_y = telconfig_data["TelY"]
Expand Down Expand Up @@ -130,7 +129,15 @@ def _resolve_branch_aliases(tree, branch_list):
resolved = [b for b in resolved if b not in synthesized]

# Drop missing optional branches
optional = {"fpointing_dx", "fpointing_dy", "E", "Erec", "ErecS", "nlowgain"}
optional = {
"fpointing_dx",
"fpointing_dy",
"E",
"Erec",
"ErecS",
"nlowgain",
"SizeSecondMax",
}
Comment on lines +139 to +140
final = [b for b in resolved if b not in optional or b in keys]

return final, rename
Expand Down Expand Up @@ -797,7 +804,17 @@ def load_training_data(model_configs, file_list, analysis_type):

input_files = utils.read_input_file_list(file_list)

branch_list = features_module.features(analysis_type, training=True)
tmva_style = model_configs.get("tmva_style", False)
if tmva_style and analysis_type == "classification":
_logger.info("Using TMVA-style features for classification")
branch_list = features_module.features_tmva_style(analysis_type, training=True)
# ze_bin is a derived feature and not present in ROOT input.
# Read elevation as auxiliary input to derive ze_bin.
branch_list = [b for b in branch_list if b not in {"ze_bin", "ArrayPointing_Azimuth"}] + [
"ArrayPointing_Elevation"
]
else:
branch_list = features_module.features(analysis_type, training=True)
_logger.info(f"Branch list: {branch_list}")
if max_events is not None and max_events > 0:
max_events_per_file = max_events // len(input_files)
Expand Down Expand Up @@ -860,17 +877,24 @@ def load_training_data(model_configs, file_list, analysis_type):
f" (fraction retained: {len(df) / n_before:.4f})"
)

df_flat = flatten_telescope_data_vectorized(
df,
tel_config["max_tel_id"] + 1,
features_module.telescope_features(analysis_type),
analysis_type,
training=True,
tel_config=tel_config,
observatory=model_configs.get("observatory", "veritas"),
max_tel_per_type=model_configs.get("max_tel_per_type", None),
preview_rows=model_configs.get("preview_rows", 20),
)
# For TMVA-style classification, skip telescope flattening (use event-level features only)
if tmva_style and analysis_type == "classification":
_logger.info("Converting to pandas (no telescope flattening for TMVA style)")
# Build DataFrame directly to stay compatible across awkward versions
# (some versions do not provide ak.to_pandas).
df_flat = pd.DataFrame({name: _to_numpy_1d(df[name]) for name in df.fields})
else:
df_flat = flatten_telescope_data_vectorized(
df,
tel_config["max_tel_id"] + 1,
features_module.telescope_features(analysis_type),
analysis_type,
training=True,
tel_config=tel_config,
observatory=model_configs.get("observatory", "veritas"),
max_tel_per_type=model_configs.get("max_tel_per_type", None),
preview_rows=model_configs.get("preview_rows", 20),
)

# Filter out events with invalid energy reconstruction for stereo training
if analysis_type == "stereo_analysis":
Expand Down Expand Up @@ -912,6 +936,14 @@ def load_training_data(model_configs, file_list, analysis_type):
for col_name, values in new_cols.items():
df_flat[col_name] = values

# For TMVA-style classification, keep pointing only as an intermediate
# to compute ze_bin, but do not expose raw pointing as ML features.
if tmva_style and analysis_type == "classification":
df_flat = df_flat.drop(
columns=["ArrayPointing_Elevation", "ArrayPointing_Azimuth"],
errors="ignore",
)

# Filter out events with NaN in residuals (can't train on these)
if analysis_type == "stereo_analysis":
n_before_nan_filter = len(df_flat)
Expand Down Expand Up @@ -1189,13 +1221,27 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
tel_list_matrix = _to_dense_array(df["DispTelList_T"])
data["array_footprint"] = _calculate_array_footprint(tel_config, tel_list_matrix)
elif "classification" in analysis_type:
n = len(index)
data = {
"MSCW": _to_numpy_1d(df["MSCW"], np.float32),
"MSCL": _to_numpy_1d(df["MSCL"], np.float32),
"EChi2S": _to_numpy_1d(df["EChi2S"], np.float32),
"EmissionHeight": _to_numpy_1d(df["EmissionHeight"], np.float32),
"EmissionHeightChi2": _to_numpy_1d(df["EmissionHeightChi2"], np.float32),
}
if _has_field(df, "SizeSecondMax"):
data["SizeSecondMax"] = _to_numpy_1d(df["SizeSecondMax"], np.float32)
# Add core distance: sqrt(Xcore^2 + Ycore^2)
if _has_field(df, "Xcore") and _has_field(df, "Ycore"):
xcore = _to_numpy_1d(df["Xcore"], np.float32)
ycore = _to_numpy_1d(df["Ycore"], np.float32)
data["Core_Distance"] = np.sqrt(xcore**2 + ycore**2).astype(np.float32)
else:
data["Core_Distance"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
if _has_field(df, "DispAbsSumWeigth"):
data["DispAbsSumWeigth"] = _to_numpy_1d(df["DispAbsSumWeigth"], np.float32)
else:
data["DispAbsSumWeigth"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32)
if not training:
data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32)

Expand All @@ -1214,6 +1260,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato
"EmissionHeightChi2",
"Erec",
"ErecS",
"SizeSecondMax",
]
apply_clip_intervals(
df_extra,
Expand Down Expand Up @@ -1248,6 +1295,8 @@ def energy_in_bins(df_chunk, bins):
def energy_interpolation_bins(df_chunk, bins):
"""Compute neighboring energy bins and interpolation weights per event.

Allows downstream interpolation using 'value = (1 - alpha) * value_lo + alpha * value_hi'.

Parameters
----------
df_chunk : pandas.DataFrame
Expand Down
7 changes: 5 additions & 2 deletions src/eventdisplay_ml/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,16 @@ def evaluation_efficiency(name, model, x_test, y_test):
f"Background Efficiency: {eff_background[-1]:.4f}"
)

eff_signal = np.asarray(eff_signal, dtype=float)
eff_background = np.asarray(eff_background, dtype=float)

return pd.DataFrame(
{
"threshold": thresholds,
"signal_efficiency": eff_signal,
"background_efficiency": eff_background,
"n_signal": n_signal,
"n_background": n_background,
"n_signal": n_signal * eff_signal,
"n_background": n_background * eff_background,
}
)

Expand Down
73 changes: 67 additions & 6 deletions src/eventdisplay_ml/features.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
"""Features used for XGB training and prediction."""


def features_tmva_style(analysis_type, training=True):
"""
Get TMVA-style features for classification analysis.

Parameters
----------
analysis_type : str
Type of analysis.
training : bool, optional
If True (default), return features including target features.
If False, return only non-target features (i.e. features used
for prediction).

Returns
-------
list
List of feature names.
"""
if analysis_type == "stereo_analysis":
raise ValueError("TMVA-style features are only defined for classification analysis.")
if "classification" in analysis_type:
return _classification_features(tmva_style=True)
raise ValueError(f"Unknown analysis type: {analysis_type}")


def target_features(analysis_type):
"""
Get target features based on analysis type.
Expand Down Expand Up @@ -67,7 +92,7 @@ def telescope_features(analysis_type):
Parameters
----------
analysis_type : str
Type of analysis, e.g. ``"classification"`` or ``"stereo_analysis"``.
Type of analysis.

Returns
-------
Expand Down Expand Up @@ -136,8 +161,36 @@ def _regression_features(training):
return var


def _classification_features():
"""Classification features."""
def _classification_features(tmva_style=False):
"""
Classification features.

Parameters
----------
tmva_style : bool, optional
If True, return features matching TMVA BDT input (default: False).

Returns
-------
list
List of feature names.
"""
if tmva_style:
# Base features from ROOT file needed to compute derived features
return [
"DispNImages",
"EChi2S",
"EmissionHeight",
"EmissionHeightChi2",
"MSCW",
"MSCL",
"SizeSecondMax",
"Xcore",
"Ycore",
"DispAbsSumWeigth",
"ArrayPointing_Elevation",
]

var_tel = telescope_features("classification")
var_array = [
"DispNImages",
Expand All @@ -162,6 +215,9 @@ def clip_intervals():
"""
Get clip intervals for variables.

Clip intervals are defined based on physical bounds or ranges used
in the classical TMVA BDT analysis.

Returns
-------
dict
Expand All @@ -183,8 +239,8 @@ def clip_intervals():
# Energy-related variables - log10 transformation with lower bound
"Erec": (energy_min, None),
"ErecS": (energy_min, None),
"EChi2S": (energy_min, None),
"EmissionHeightChi2": (1e-6, None),
"EChi2S": (energy_min, 10000.0), # TMVA: -6 to 4 (before log10)
"EmissionHeightChi2": (1e-6, 10000.0), # TMVA: -11 to 4 (before log10)
Comment on lines +242 to +243
"img2_ang": (0.0, 360.0),
# Per-telescope energy and size variables - log10 transformation with lower bound
"size": (1, None),
Expand All @@ -195,8 +251,13 @@ def clip_intervals():
# Derived variables - avoid numerical issues
"size_dist2": (1.0, None),
"tgrad_x": (-50.0, 50.0),
"MSCW": (-2.0, 2.0),
"MSCL": (-2.0, 5.0),
"SizeSecondMax": (1e-6, 100000.0), # TMVA: 0 to 5 (after log10)
"Core_Distance": (0.0, 1000.0), # sqrt(Xcore^2 + Ycore^2)
"DispAbsSumWeigth": (0.0, 5.0), # TMVA: 0 to 5
# Physical bounds
"EmissionHeight": (0, 120), # top of atmosphere
"EmissionHeight": (0, 100), # top of atmosphere
"R_core": (-10, None), # badly reconstructed events
}

Expand Down
22 changes: 22 additions & 0 deletions tests/test_classification_apply_interpolation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,25 @@ def test_apply_classification_models_interpolates_probabilities_and_thresholds(m

# Thresholds are interpolated the same way: [0.5, 0.7]
np.testing.assert_array_equal(is_gamma[50], np.array([0, 1], dtype=np.uint8))


def test_extra_columns_skip_tmva_only_size_second_max_when_branch_missing():
"""Standard classification should not synthesize the TMVA-only SizeSecondMax column."""
df = pd.DataFrame(
{
"MSCW": [0.1, 0.2],
"MSCL": [0.3, 0.4],
"EChi2S": [1.0, 2.0],
"EmissionHeight": [10.0, 20.0],
"EmissionHeightChi2": [0.5, 0.25],
}
)

result = data_processing.extra_columns(
df,
analysis_type="classification",
training=True,
index=pd.RangeIndex(2),
)

assert "SizeSecondMax" not in result.columns
48 changes: 48 additions & 0 deletions tests/test_data_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Tests for data_processing."""

import numpy as np
import pandas as pd

from eventdisplay_ml.data_processing import energy_interpolation_bins, zenith_in_bins


def test_zenith_in_bins_numeric_edges_clips_and_handles_boundaries():
"""Numeric bin edges should clip out-of-range values and place edge values consistently."""
zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
bins = [0.0, 10.0, 20.0, 30.0]

result = zenith_in_bins(zenith_angles, bins)

np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
assert result.dtype == np.int32


def test_zenith_in_bins_dict_bins_matches_numeric_definition():
"""Dict-style zenith bins should map to the same indices as explicit numeric edges."""
zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float)
dict_bins = [
{"Ze_min": 0.0, "Ze_max": 10.0},
{"Ze_min": 10.0, "Ze_max": 20.0},
{"Ze_min": 20.0, "Ze_max": 30.0},
]

result = zenith_in_bins(zenith_angles, dict_bins)

np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32))
assert result.dtype == np.int32


def test_energy_interpolation_bins_interpolates_and_clamps_with_invalid_events():
"""Interpolation bins should handle interior, edge, and invalid energies robustly."""
df_chunk = pd.DataFrame({"Erec": [0.0, 0.1, 1.0, 10.0, 100.0]})
bins = [
{"E_min": -1.5, "E_max": -0.5},
None,
{"E_min": 0.5, "E_max": 1.5},
]

e_bin_lo, e_bin_hi, e_alpha = energy_interpolation_bins(df_chunk, bins)

np.testing.assert_array_equal(e_bin_lo, np.array([-1, 0, 0, 0, 2], dtype=np.int32))
np.testing.assert_array_equal(e_bin_hi, np.array([-1, 0, 2, 2, 2], dtype=np.int32))
np.testing.assert_allclose(e_alpha, np.array([0.0, 0.0, 0.5, 1.0, 0.0], dtype=np.float32))