diff --git a/docs/changes/58.feature.md b/docs/changes/58.feature.md new file mode 100644 index 0000000..c7f0210 --- /dev/null +++ b/docs/changes/58.feature.md @@ -0,0 +1 @@ +Add TMVA-style gamma/hadron separation with the same features as TMVA BDT classification analysis. diff --git a/src/eventdisplay_ml/config.py b/src/eventdisplay_ml/config.py index 860bbca..07b47db 100644 --- a/src/eventdisplay_ml/config.py +++ b/src/eventdisplay_ml/config.py @@ -149,12 +149,23 @@ def configure_training(analysis_type): model_parameters = utils.load_model_parameters( model_configs["model_parameters"], model_configs["energy_bin_number"] ) + tmva_style_raw = model_parameters.get("tmva_style", False) + if isinstance(tmva_style_raw, str): + model_configs["tmva_style"] = tmva_style_raw.strip().lower() in { + "1", + "true", + "yes", + "on", + } + else: + model_configs["tmva_style"] = bool(tmva_style_raw) model_configs["pre_cuts"] = pre_cuts_classification( e_min=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_min")), e_max=np.power(10.0, model_parameters.get("energy_bins_log10_tev", []).get("E_max")), ) model_configs["energy_bins_log10_tev"] = model_parameters.get("energy_bins_log10_tev", []) model_configs["zenith_bins_deg"] = model_parameters.get("zenith_bins_deg", []) + _logger.info(f"TMVA-style classification: {model_configs['tmva_style']}") _logger.info(f"Pre-cuts: {model_configs['pre_cuts']}") diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py index b9fa59b..a3801d2 100644 --- a/src/eventdisplay_ml/data_processing.py +++ b/src/eventdisplay_ml/data_processing.py @@ -51,7 +51,6 @@ def read_telescope_config(root_file): n_tel = int(telconfig_data["NTel"][0]) tel_ids = telconfig_data["TelID"] - # Keep array of mirror areas; avoid shadowing this name later mirror_area_arr = telconfig_data["MirrorArea"] tel_x = telconfig_data["TelX"] tel_y = telconfig_data["TelY"] @@ -130,7 +129,15 @@ def _resolve_branch_aliases(tree, branch_list): resolved = [b for b in resolved if b not in synthesized] # Drop missing optional branches - optional = {"fpointing_dx", "fpointing_dy", "E", "Erec", "ErecS", "nlowgain"} + optional = { + "fpointing_dx", + "fpointing_dy", + "E", + "Erec", + "ErecS", + "nlowgain", + "SizeSecondMax", + } final = [b for b in resolved if b not in optional or b in keys] return final, rename @@ -797,7 +804,17 @@ def load_training_data(model_configs, file_list, analysis_type): input_files = utils.read_input_file_list(file_list) - branch_list = features_module.features(analysis_type, training=True) + tmva_style = model_configs.get("tmva_style", False) + if tmva_style and analysis_type == "classification": + _logger.info("Using TMVA-style features for classification") + branch_list = features_module.features_tmva_style(analysis_type, training=True) + # ze_bin is a derived feature and not present in ROOT input. + # Read elevation as auxiliary input to derive ze_bin. + branch_list = [b for b in branch_list if b not in {"ze_bin", "ArrayPointing_Azimuth"}] + [ + "ArrayPointing_Elevation" + ] + else: + branch_list = features_module.features(analysis_type, training=True) _logger.info(f"Branch list: {branch_list}") if max_events is not None and max_events > 0: max_events_per_file = max_events // len(input_files) @@ -860,17 +877,24 @@ def load_training_data(model_configs, file_list, analysis_type): f" (fraction retained: {len(df) / n_before:.4f})" ) - df_flat = flatten_telescope_data_vectorized( - df, - tel_config["max_tel_id"] + 1, - features_module.telescope_features(analysis_type), - analysis_type, - training=True, - tel_config=tel_config, - observatory=model_configs.get("observatory", "veritas"), - max_tel_per_type=model_configs.get("max_tel_per_type", None), - preview_rows=model_configs.get("preview_rows", 20), - ) + # For TMVA-style classification, skip telescope flattening (use event-level features only) + if tmva_style and analysis_type == "classification": + _logger.info("Converting to pandas (no telescope flattening for TMVA style)") + # Build DataFrame directly to stay compatible across awkward versions + # (some versions do not provide ak.to_pandas). + df_flat = pd.DataFrame({name: _to_numpy_1d(df[name]) for name in df.fields}) + else: + df_flat = flatten_telescope_data_vectorized( + df, + tel_config["max_tel_id"] + 1, + features_module.telescope_features(analysis_type), + analysis_type, + training=True, + tel_config=tel_config, + observatory=model_configs.get("observatory", "veritas"), + max_tel_per_type=model_configs.get("max_tel_per_type", None), + preview_rows=model_configs.get("preview_rows", 20), + ) # Filter out events with invalid energy reconstruction for stereo training if analysis_type == "stereo_analysis": @@ -912,6 +936,14 @@ def load_training_data(model_configs, file_list, analysis_type): for col_name, values in new_cols.items(): df_flat[col_name] = values + # For TMVA-style classification, keep pointing only as an intermediate + # to compute ze_bin, but do not expose raw pointing as ML features. + if tmva_style and analysis_type == "classification": + df_flat = df_flat.drop( + columns=["ArrayPointing_Elevation", "ArrayPointing_Azimuth"], + errors="ignore", + ) + # Filter out events with NaN in residuals (can't train on these) if analysis_type == "stereo_analysis": n_before_nan_filter = len(df_flat) @@ -1189,6 +1221,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato tel_list_matrix = _to_dense_array(df["DispTelList_T"]) data["array_footprint"] = _calculate_array_footprint(tel_config, tel_list_matrix) elif "classification" in analysis_type: + n = len(index) data = { "MSCW": _to_numpy_1d(df["MSCW"], np.float32), "MSCL": _to_numpy_1d(df["MSCL"], np.float32), @@ -1196,6 +1229,19 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato "EmissionHeight": _to_numpy_1d(df["EmissionHeight"], np.float32), "EmissionHeightChi2": _to_numpy_1d(df["EmissionHeightChi2"], np.float32), } + if _has_field(df, "SizeSecondMax"): + data["SizeSecondMax"] = _to_numpy_1d(df["SizeSecondMax"], np.float32) + # Add core distance: sqrt(Xcore^2 + Ycore^2) + if _has_field(df, "Xcore") and _has_field(df, "Ycore"): + xcore = _to_numpy_1d(df["Xcore"], np.float32) + ycore = _to_numpy_1d(df["Ycore"], np.float32) + data["Core_Distance"] = np.sqrt(xcore**2 + ycore**2).astype(np.float32) + else: + data["Core_Distance"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32) + if _has_field(df, "DispAbsSumWeigth"): + data["DispAbsSumWeigth"] = _to_numpy_1d(df["DispAbsSumWeigth"], np.float32) + else: + data["DispAbsSumWeigth"] = np.full(n, DEFAULT_FILL_VALUE, dtype=np.float32) if not training: data["ze_bin"] = _to_numpy_1d(df["ze_bin"], np.float32) @@ -1214,6 +1260,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None, observato "EmissionHeightChi2", "Erec", "ErecS", + "SizeSecondMax", ] apply_clip_intervals( df_extra, @@ -1248,6 +1295,8 @@ def energy_in_bins(df_chunk, bins): def energy_interpolation_bins(df_chunk, bins): """Compute neighboring energy bins and interpolation weights per event. + Allows downstream interpolation using 'value = (1 - alpha) * value_lo + alpha * value_hi'. + Parameters ---------- df_chunk : pandas.DataFrame diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py index 54167d8..fe93b17 100644 --- a/src/eventdisplay_ml/evaluate.py +++ b/src/eventdisplay_ml/evaluate.py @@ -36,13 +36,16 @@ def evaluation_efficiency(name, model, x_test, y_test): f"Background Efficiency: {eff_background[-1]:.4f}" ) + eff_signal = np.asarray(eff_signal, dtype=float) + eff_background = np.asarray(eff_background, dtype=float) + return pd.DataFrame( { "threshold": thresholds, "signal_efficiency": eff_signal, "background_efficiency": eff_background, - "n_signal": n_signal, - "n_background": n_background, + "n_signal": n_signal * eff_signal, + "n_background": n_background * eff_background, } ) diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py index b4818d7..d4b277d 100644 --- a/src/eventdisplay_ml/features.py +++ b/src/eventdisplay_ml/features.py @@ -1,6 +1,31 @@ """Features used for XGB training and prediction.""" +def features_tmva_style(analysis_type, training=True): + """ + Get TMVA-style features for classification analysis. + + Parameters + ---------- + analysis_type : str + Type of analysis. + training : bool, optional + If True (default), return features including target features. + If False, return only non-target features (i.e. features used + for prediction). + + Returns + ------- + list + List of feature names. + """ + if analysis_type == "stereo_analysis": + raise ValueError("TMVA-style features are only defined for classification analysis.") + if "classification" in analysis_type: + return _classification_features(tmva_style=True) + raise ValueError(f"Unknown analysis type: {analysis_type}") + + def target_features(analysis_type): """ Get target features based on analysis type. @@ -67,7 +92,7 @@ def telescope_features(analysis_type): Parameters ---------- analysis_type : str - Type of analysis, e.g. ``"classification"`` or ``"stereo_analysis"``. + Type of analysis. Returns ------- @@ -136,8 +161,36 @@ def _regression_features(training): return var -def _classification_features(): - """Classification features.""" +def _classification_features(tmva_style=False): + """ + Classification features. + + Parameters + ---------- + tmva_style : bool, optional + If True, return features matching TMVA BDT input (default: False). + + Returns + ------- + list + List of feature names. + """ + if tmva_style: + # Base features from ROOT file needed to compute derived features + return [ + "DispNImages", + "EChi2S", + "EmissionHeight", + "EmissionHeightChi2", + "MSCW", + "MSCL", + "SizeSecondMax", + "Xcore", + "Ycore", + "DispAbsSumWeigth", + "ArrayPointing_Elevation", + ] + var_tel = telescope_features("classification") var_array = [ "DispNImages", @@ -162,6 +215,9 @@ def clip_intervals(): """ Get clip intervals for variables. + Clip intervals are defined based on physical bounds or ranges used + in the classical TMVA BDT analysis. + Returns ------- dict @@ -183,8 +239,8 @@ def clip_intervals(): # Energy-related variables - log10 transformation with lower bound "Erec": (energy_min, None), "ErecS": (energy_min, None), - "EChi2S": (energy_min, None), - "EmissionHeightChi2": (1e-6, None), + "EChi2S": (energy_min, 10000.0), # TMVA: -6 to 4 (before log10) + "EmissionHeightChi2": (1e-6, 10000.0), # TMVA: -11 to 4 (before log10) "img2_ang": (0.0, 360.0), # Per-telescope energy and size variables - log10 transformation with lower bound "size": (1, None), @@ -195,8 +251,13 @@ def clip_intervals(): # Derived variables - avoid numerical issues "size_dist2": (1.0, None), "tgrad_x": (-50.0, 50.0), + "MSCW": (-2.0, 2.0), + "MSCL": (-2.0, 5.0), + "SizeSecondMax": (1e-6, 100000.0), # TMVA: 0 to 5 (after log10) + "Core_Distance": (0.0, 1000.0), # sqrt(Xcore^2 + Ycore^2) + "DispAbsSumWeigth": (0.0, 5.0), # TMVA: 0 to 5 # Physical bounds - "EmissionHeight": (0, 120), # top of atmosphere + "EmissionHeight": (0, 100), # top of atmosphere "R_core": (-10, None), # badly reconstructed events } diff --git a/tests/test_classification_apply_interpolation.py b/tests/test_classification_apply_interpolation.py index a0d0e67..198d71e 100644 --- a/tests/test_classification_apply_interpolation.py +++ b/tests/test_classification_apply_interpolation.py @@ -70,3 +70,25 @@ def test_apply_classification_models_interpolates_probabilities_and_thresholds(m # Thresholds are interpolated the same way: [0.5, 0.7] np.testing.assert_array_equal(is_gamma[50], np.array([0, 1], dtype=np.uint8)) + + +def test_extra_columns_skip_tmva_only_size_second_max_when_branch_missing(): + """Standard classification should not synthesize the TMVA-only SizeSecondMax column.""" + df = pd.DataFrame( + { + "MSCW": [0.1, 0.2], + "MSCL": [0.3, 0.4], + "EChi2S": [1.0, 2.0], + "EmissionHeight": [10.0, 20.0], + "EmissionHeightChi2": [0.5, 0.25], + } + ) + + result = data_processing.extra_columns( + df, + analysis_type="classification", + training=True, + index=pd.RangeIndex(2), + ) + + assert "SizeSecondMax" not in result.columns diff --git a/tests/test_data_processing.py b/tests/test_data_processing.py new file mode 100644 index 0000000..e565113 --- /dev/null +++ b/tests/test_data_processing.py @@ -0,0 +1,48 @@ +"""Tests for data_processing.""" + +import numpy as np +import pandas as pd + +from eventdisplay_ml.data_processing import energy_interpolation_bins, zenith_in_bins + + +def test_zenith_in_bins_numeric_edges_clips_and_handles_boundaries(): + """Numeric bin edges should clip out-of-range values and place edge values consistently.""" + zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float) + bins = [0.0, 10.0, 20.0, 30.0] + + result = zenith_in_bins(zenith_angles, bins) + + np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32)) + assert result.dtype == np.int32 + + +def test_zenith_in_bins_dict_bins_matches_numeric_definition(): + """Dict-style zenith bins should map to the same indices as explicit numeric edges.""" + zenith_angles = np.array([-5.0, 0.0, 9.9, 10.0, 19.9, 20.0, 42.0], dtype=float) + dict_bins = [ + {"Ze_min": 0.0, "Ze_max": 10.0}, + {"Ze_min": 10.0, "Ze_max": 20.0}, + {"Ze_min": 20.0, "Ze_max": 30.0}, + ] + + result = zenith_in_bins(zenith_angles, dict_bins) + + np.testing.assert_array_equal(result, np.array([0, 0, 0, 1, 1, 2, 2], dtype=np.int32)) + assert result.dtype == np.int32 + + +def test_energy_interpolation_bins_interpolates_and_clamps_with_invalid_events(): + """Interpolation bins should handle interior, edge, and invalid energies robustly.""" + df_chunk = pd.DataFrame({"Erec": [0.0, 0.1, 1.0, 10.0, 100.0]}) + bins = [ + {"E_min": -1.5, "E_max": -0.5}, + None, + {"E_min": 0.5, "E_max": 1.5}, + ] + + e_bin_lo, e_bin_hi, e_alpha = energy_interpolation_bins(df_chunk, bins) + + np.testing.assert_array_equal(e_bin_lo, np.array([-1, 0, 0, 0, 2], dtype=np.int32)) + np.testing.assert_array_equal(e_bin_hi, np.array([-1, 0, 2, 2, 2], dtype=np.int32)) + np.testing.assert_allclose(e_alpha, np.array([0.0, 0.0, 0.5, 1.0, 0.0], dtype=np.float32))