From b4748aebbdbbaa04d8e0523850070d34a9a1d234 Mon Sep 17 00:00:00 2001 From: Tucker Hwang Date: Mon, 13 Apr 2026 22:55:24 -0700 Subject: [PATCH 1/3] Expand basic analysis framework --- alian/analysis/base/__init__.py | 20 +- alian/analysis/base/analysis.py | 166 +++++- alian/analysis/base/csubtractor.py | 2 +- alian/analysis/base/event.py | 99 ++++ alian/analysis/base/jet_finder.py | 104 ++++ alian/analysis/base/logs.py | 42 ++ alian/analysis/base/output.py | 128 +++++ alian/analysis/base/selection.py | 143 +++++ alian/analysis/base/selector.py | 803 +++++++++++++++++++++++++++++ alian/analysis/base/utils.py | 61 +++ 10 files changed, 1564 insertions(+), 4 deletions(-) create mode 100644 alian/analysis/base/event.py create mode 100644 alian/analysis/base/jet_finder.py create mode 100644 alian/analysis/base/logs.py create mode 100644 alian/analysis/base/output.py create mode 100644 alian/analysis/base/selection.py create mode 100644 alian/analysis/base/selector.py create mode 100644 alian/analysis/base/utils.py diff --git a/alian/analysis/base/__init__.py b/alian/analysis/base/__init__.py index bbf918b..7507403 100644 --- a/alian/analysis/base/__init__.py +++ b/alian/analysis/base/__init__.py @@ -1,2 +1,18 @@ -from .analysis import * -from .csubtractor import * +from .analysis import AnalysisBase, add_default_args +from .csubtractor import CEventSubtractor +from .event import Event +from .jet_finder import JetFinder +from .logs import set_up_logger +from .selection import EventSel, RCTSel, TrackSel, TrigSel +from .selector import AnalysisSelector +from .utils import delta_R, linbins, logbins, ndict, nested_dict, read_yaml + +__all__ = ["AnalysisBase", "add_default_args", + "CEventSubtractor", + "Event", + "JetFinder", + "set_up_logger", + "EventSel", "RCTSel", "TrackSel", "TrigSel", + "AnalysisSelector", + "delta_R", "linbins", "logbins", "ndict", "nested_dict", "read_yaml" + ] diff --git a/alian/analysis/base/analysis.py b/alian/analysis/base/analysis.py index 
from time import perf_counter

from alian.io import data_io
from ROOT import TFile

import heppyy

from .event import Event, get_selected_clusters, get_selected_tracks
from .jet_finder import JetFinder
from .logs import set_up_logger
from .output import Output
from .selector import AnalysisSelector
from .utils import is_slurm, read_yaml


class BaseAnalysis(heppyy.GenericObject):
    # Per-subclass default configuration values, filled into missing attributes.
    _defaults = {}

    def __init__(self, **kwargs):
        # Fill any attribute that is unset (or None) from the subclass defaults.
        # NOTE(review): reconstructed from diff context; the hunk elides lines
        # between the class header and __init__ — confirm against the repository.
        for k, val in self.__class__._defaults.items():
            if not hasattr(self, k) or getattr(self, k) is None:
                setattr(self, k, val)


class AnalysisBase:
    """A base class for analysis tasks.

    This base class describes the basic event analysis structure.
    At minimum, the method `analyze_event()` must be overridden. Depending
    on the selectors defined in the configuration, clusters may or may not be
    loaded into the event.
    """
    # Subclasses list their configurable parameters (used by dump()) here.
    _defaults = {}

    def __init__(self, input_file, output_file, cfg, tree_struct, nev = -1, lhc_run = 3):
        """Store paths and run parameters; heavy setup is deferred to run()/init().

        Args:
            input_file: input file, or a file containing a list of input files.
            output_file: ROOT file to write the analysis output to.
            cfg: path to the YAML configuration (selections, output, jet finder).
            tree_struct: path to a YAML file describing the tree structure.
            nev: number of events to process; -1 means all.
            lhc_run: LHC Run number (default 3).
        """
        self.input_file = input_file
        self.output_file = output_file
        self.cfg_file = cfg
        self.tree_struct = tree_struct
        self.nev = nev
        self.lhc_run = lhc_run

    def run(self):
        """Execute the full pipeline: init -> event loop -> finalize -> save."""
        self.start_time = perf_counter()
        # set up logger
        self.logger = set_up_logger(__name__)
        self.logger.info("Starting analysis!")

        self.logger.info("Configuring analysis pipeline...")
        self.init()
        self.logger.info("Analysis pipeline configured.")
        self.analyze_events()
        self.finalize()
        self.save()

        self.note_time("Analysis complete")

    def init(self):
        """Read the YAML config and set up data input, selectors, jet finder, output."""
        self.cfg = read_yaml(self.cfg_file)
        # validate YAML config
        self._validate_cfg(self.cfg)
        self.logger.info("Config schema validated.")
        # initialize data input
        self.data_source = data_io.DataInput(self.input_file,
                                             lhc_run = self.lhc_run,
                                             yaml_file = self.tree_struct,
                                             n_events = self.nev)
        # initialize selectors for events, tracks, clusters
        self.logger.info("Configuring selectors...")
        self.selector = AnalysisSelector.load(self.cfg)
        self.selector.dump()
        self.logger.info("Selectors configured.")
        # clusters are only materialized when a cluster selector is configured
        if self.selector.is_active("cluster"):
            self.logger.info("Cluster selector found, clusters will be loaded into events.")
            self.load_clusters = True
        else:
            self.logger.info("Cluster selector not found, clusters will NOT be loaded into events.")
            self.load_clusters = False
        # initialize jet finder (optional block)
        if 'jet_finder' in self.cfg:
            self.logger.info("Jet finder config found, jets will be loaded into events.")
            self.load_jets = True
            self.logger.info("Configuring jet finder...")
            self.jet_finder = JetFinder.load(self.cfg)
            self.jet_finder.dump()
            self.logger.info("Jet finder configured.")
        else:
            self.logger.info("No configuration for jet finder, no jets will be loaded.")
            self.load_jets = False
        self.logger.info("Configuring output...")
        self.init_output()
        self.logger.info("Output configured.")
        # initialize analysis parameters
        self.logger.info("Configuring analysis parameters...")
        self.init_analysis(self.cfg.get('analysis', {}))
        self.dump()
        self.logger.info("Analysis parameters configured.")

    def _validate_cfg(self, cfg: dict):
        """Raise KeyError if any required top-level YAML block is missing."""
        req_headers = ['selections', 'output']
        if headers_not_in_cfg := [h for h in req_headers if h not in cfg]:
            raise KeyError(f"The following blocks must be present in the YAML configuration: {headers_not_in_cfg}")

    def init_analysis(self, analysis_cfg: dict):
        """Initialize analysis configuration parameters. Override in analyses if necessary."""
        self.logger.info("No analysis configuration to parse.")

    def init_output(self):
        """Build the Output (histograms/trees) from the config and expose shortcuts."""
        self.output = Output.load(self.cfg)
        self.hists = self.output.hists
        self.trees = self.output.trees

    def analyze_events(self):
        """Run the event loop: build, select, then analyze each event."""
        self.logger.info("Analyzing events...")
        # disable the progress bar on batch (SLURM) nodes
        slurm_check = is_slurm()
        for e in self.data_source.next_event(disable_bar = slurm_check):
            # build the event only
            self.build_event(e)
            # skip if the event doesn't pass the selection
            if not self.selector.event.selects(self.event):
                continue
            # build the rest of the event and analyze
            self.build_event_objs(e)
            self.analyze_event()
        self.note_time("Events analyzed")

    def analyze_event(self):
        # Subclasses implement the per-event physics here.
        raise NotImplementedError("Must implement an analyze_event method!")

    def build_event(self, event_struct):
        """Build event as Event and store in self.event."""
        self.event = Event(event_struct)

    def build_event_objs(self, event_struct):
        """Build tracks and optionally clusters and jets from associated event."""
        self.tracks = get_selected_tracks(event_struct, self.selector.track)
        if self.load_clusters:
            self.clusters = get_selected_clusters(event_struct, self.selector.cluster)
        if self.load_jets:
            self.jets = self.jet_finder.find_jets(self.tracks)

    def finalize(self):
        """Finalize analysis before saving output. Should be overridden in analyses if necessary."""
        self.logger.info("Nothing to finalize.")

    def save(self):
        """Write all output objects to self.output_file."""
        self.logger.info(f"Saving output to: {self.output_file}")
        # ROOT's TFile supports the context-manager protocol (file is closed on exit)
        with TFile(self.output_file, "RECREATE") as f:
            self.output.save(f)
        self.logger.info("Output saved.")

    def __getattr__(self, attr):
        # Only invoked when normal attribute lookup fails; gives actionable
        # errors for the two optional event collections.
        if attr == "jets":
            raise AttributeError("Jets not defined; include a `jet_finder` block in your config!")
        if attr == "clusters":
            raise AttributeError("Clusters not defined; include a `clusters` block in the `selections` section of your config! The block can be empty if you want no selections applied.")
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")

    def dump(self):
        """Log every parameter listed in the subclass's _defaults."""
        cfg = "\n".join([f"\t{param}: {repr(getattr(self, param))}" for param in self._defaults])
        self.logger.info(f"{type(self).__name__} configuration:\n{cfg}", stacklevel = 2)

    def note_time(self, msg):
        """Log `msg` together with the elapsed wall time since run() started."""
        self.logger.info(f"{msg}: ------- {self.fmt_time(perf_counter() - self.start_time)} -------", stacklevel = 2)

    def fmt_time(self, seconds):
        """Format a duration in seconds as 'Hh MMm SSs' (rounded to whole seconds)."""
        m, s = divmod(round(seconds), 60)
        h, m = divmod(m, 60)
        return f"{h:d}h {m:02d}m {s:02d}s"


def add_default_args(parser):
    """Attach the standard analysis CLI options to `parser` and return it."""
    parser.add_argument('-i', '--input-file', type = str, required = True, help = "Input file or file containing a list of input files.")
    parser.add_argument("-o", "--output-file", type = str, default = "analysis.root", help = "File to write the analysis.")
    parser.add_argument('-c', '--config-file', type = str, required = True, help = "Path to a YAML configuration file describing the cuts to be applied to the data.")
    parser.add_argument("-n", "--nev", type = int, default = -1, help = "Number of entries to process, set to -1 for all events.")
    parser.add_argument('-t', '--tree-struct', type = str, default = None, help = "Path to a YAML file describing the tree structure.")
    parser.add_argument('--lhc-run', type = int, default = 3, help = 'LHC Run')

    return parser
import heppyy.util.fastjet_cppyy
from cppyy.gbl import fastjet as fj
from cppyy.gbl import std

import heppyy

from .selection import EventSel, RCTSel, TrigSel

alian = heppyy.load_cppyy("alian")


class Event:
    """Lightweight view of the event-level fields of a data_io event structure."""

    def __init__(self, ev):
        self.run_number = ev.data["run_number"]
        self.multiplicity = ev.data["multiplicity"]
        self.centrality = ev.data["centrality"]
        self.occupancy = ev.data["occupancy"]
        # wrap the raw bitmasks in their Selection enums for readable checks
        self.event_sel = EventSel(ev.data["event_sel"])
        self.trig_sel = TrigSel(ev.data["trig_sel"])
        self.rct = RCTSel(ev.data["rct"])


def get_tracks(ev):
    """Get all tracks from an event (no track selections applied)."""
    return alian.numpy_ptetaphi_to_tracks(
        ev.data["track_pt"],
        ev.data["track_eta"],
        ev.data["track_phi"],
        ev.data["track_sel"],
        0,
    )


def get_selected_tracks(ev, selector):
    """Get selected tracks from an event (track selections applied)."""
    # Delegate to get_tracks() so the branch-name list lives in one place.
    return std.vector[fj.PseudoJet](
        [t for t in get_tracks(ev) if selector.selects(t)]
    )


def get_clusters(ev):
    """Get all clusters from an event (no cluster selections applied)."""
    return alian.numpy_energyetaphi_to_clusters(
        ev.data["cluster_energy"],
        ev.data["cluster_eta"],
        ev.data["cluster_phi"],
        ev.data["cluster_m02"],
        ev.data["cluster_m20"],
        ev.data["cluster_ncells"],
        ev.data["cluster_time"],
        ev.data["cluster_exoticity"],
        ev.data["cluster_dbc"],
        ev.data["cluster_nlm"],
        ev.data["cluster_defn"],
        ev.data["cluster_matched_track_n"],
        ev.data["cluster_matched_track_delta_eta"],
        ev.data["cluster_matched_track_delta_phi"],
        ev.data["cluster_matched_track_p"],
        ev.data["cluster_matched_track_pt"],
        ev.data["cluster_matched_track_sel"],
        0,
    )


def get_selected_clusters(ev, selector):
    """Get selected clusters from an event (cluster selections applied)."""
    # Delegate to get_clusters() so the (long) branch-name list lives in one place.
    return [c for c in get_clusters(ev) if selector.selects(c)]
+ """ + + _defaults = { + 'R': 0.4, + 'alg': fj.antikt_algorithm, + 'pT_min': 10, + } + _defaults["eta_max"] = 0.9 - _defaults["R"] + + alg_registry = { + fj.antikt_algorithm: ['antikt', 'anti-kt', 'antikT', 'anti-kT', 'anti', 'ak', -1], + fj.kt_algorithm: ['kt', 'kT', 'k', 1], + fj.cambridge_algorithm: ['ca', 'CA', 'cambridge' 'cambridge-aachen', 'cambridge/aachen', 0], + } + alg_map = { # mapping from string to fj.JetAlgorithm + key: value + for value, keys in alg_registry.items() + for key in keys + } + + def __init__(self, alg = fj.antikt_algorithm, R = 0.4, eta_max = None, pT_min = 10): + self.logger = set_up_logger(__name__) + self.R = R + if eta_max is None: + self.eta_max = 0.9 - self.R + else: + self.eta_max = eta_max + self.alg = self._parse_jet_alg(alg) + self.jet_def = fj.JetDefinition(self.alg, self.R) + self.pT_min = pT_min + self.jet_selector = fj.SelectorAbsEtaMax(self.eta_max) * fj.SelectorPtMin(pT_min) + fj.ClusterSequence.print_banner() + # TODO: jet background subtraction + # self.area_def = fj.AreaDefinition(fj.active_area, fj.GhostedAreaSpec(self.eta_max + self.jet_R)) + # self.bg_estimator = fj.GridMedianBackgroundEstimator(self.bg_y_max, self.bg_grid_spacing) + + @singledispatchmethod + @classmethod + def load(cls, file, *args, **kwargs): + raise NotImplementedError(f'Cannot configure jet finder from type {type(file)}.') + @load.register(dict) + @classmethod + def _load(cls, cfg): + if "jet_finder" not in cfg: + raise KeyError("The JetFinder must be configured in a 'jet_finder' block in the YAML configuration!") + if cfg['jet_finder'] is None: + cfg["jet_finder"] = {} + + options = {**cls._defaults, **cfg["jet_finder"]} + + return cls(**options) + @load.register(str) + @load.register(Path) + @classmethod + def _load(cls, file): + cfg = read_yaml(file) + return cls.load(cfg) + + def dump(self): + """Dump all jet finding parameters.""" + cfg = "\n".join([f"\t{param}: {repr(getattr(self, param))}" for param in self._defaults]) + 
self.logger.info(f"JetFinder configuration:\n{cfg}", stacklevel = 2) + + def _parse_jet_alg(self, alg: str): + if isinstance(alg, fj.JetAlgorithm): + return alg + elif isinstance(alg, str): + try: + return self.alg_map[alg] + except KeyError: + valid_algs = sum(self.alg_registry.values(), []) + self.logger.error(f"Did not recognize jet algorithm '{alg}', defaulting to anti-kT. Valid options: {valid_algs}") + return fj.antikt_algorithm + else: + valid_algs = sum(self.alg_registry.values(), []) + self.logger.error(f"Cannot pass type {type(alg)} as jet algorithm, defaulting to anti-kT. Valid options: {valid_algs}") + return fj.antikt_algorithm + + def find_jets(self, tracks, m = 0.13957, index_offset = 0): + # NOTE: ClusterSequence must be attached somehow to self to keep it in scope + self.cs = fj.ClusterSequence(tracks, self.jet_def) + jets = fj.sorted_by_pt(self.jet_selector(self.cs.inclusive_jets())) + return jets diff --git a/alian/analysis/base/logs.py b/alian/analysis/base/logs.py new file mode 100644 index 0000000..cb26263 --- /dev/null +++ b/alian/analysis/base/logs.py @@ -0,0 +1,42 @@ +import logging + +from .utils import is_slurm + + +class ColoredFormatter(logging.Formatter): + COLORS = { + 'DEBUG': '\033[34m', + 'INFO': '\033[32m', + 'WARNING': '\033[33m', + 'ERROR': '\033[31m', + 'CRITICAL': '\033[35m' + } + RESET = '\033[0m' + + def format(self, record): + color = self.COLORS.get(record.levelname, '') + if color: + # Color the entire line + formatted_msg = super().format(record) + return f"{color}{formatted_msg}{self.RESET}" + return super().format(record) + +def set_up_logger(name, + handler_level = logging.INFO, + logger_level = logging.DEBUG, + log_fmt = '%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(funcName)s - %(message)s', + date_fmt = '%Y-%m-%d %H:%M:%S' + ): + logger = logging.getLogger(name) + handler = logging.StreamHandler() + + if is_slurm(): + formatter = logging.Formatter(log_fmt, datefmt = date_fmt) + else: + formatter = 
ColoredFormatter(log_fmt, datefmt = date_fmt) + + handler.setFormatter(formatter) + handler.setLevel(handler_level) + logger.addHandler(handler) + logger.setLevel(logger_level) + return logger \ No newline at end of file diff --git a/alian/analysis/base/output.py b/alian/analysis/base/output.py new file mode 100644 index 0000000..a037d59 --- /dev/null +++ b/alian/analysis/base/output.py @@ -0,0 +1,128 @@ +from functools import singledispatchmethod +from pathlib import Path + +import numpy as np +import ROOT as r + +from .logs import set_up_logger +from .utils import linbins, logbins, read_yaml + +r.TH1.AddDirectory(False) +r.TH1.SetDefaultSumw2() +r.TH2.SetDefaultSumw2() + +registry = { + r.TH1F: [1, '1', '1F', '1f'], + r.TH2F: [2, '2', '2F', '2f'], + r.TH3F: [3, '3', '3F', '3f'], + r.TH1D: ['1D', '1d'], + r.TH2D: ['2D', '2d'], + r.TH3D: ['3D', '3d'], + r.TH1I: ['1I', '1i'], + r.TH2I: ['2I', '2i'], + r.TH3I: ['3I', '3i'], +} + +# reverse registry mapping to map key to specific ROOT class +HISTOGRAM_REGISTRY = { + key: value + for value, keys in registry.items() + for key in keys +} + +class Output: + def __init__(self, bins, histograms, branches, trees): + self.logger = set_up_logger(__name__) + self.logger.info("Initializing histograms...", stacklevel = 2) + self._init_bins(bins) + self._init_histograms(histograms) + self.logger.info("Histograms initialized.", stacklevel = 2) + self.logger.info("Initializing trees...", stacklevel = 2) + self._init_branches(branches) + self._init_trees(trees) + self.logger.info("Trees initialized.", stacklevel = 2) + + @singledispatchmethod + @classmethod + def load(cls, file): + raise NotImplementedError(f'Cannot configure output from type {type(file)}.') + @load.register(dict) + @classmethod + def _load(cls, cfg): + if "output" not in cfg: + raise KeyError("Output must be configured in a 'output' block in the YAML configuration!") + if cfg["output"] is None: + raise KeyError("The 'output' block in the YAML configuration cannot be 
empty!") + + cfg_output = {} + fields = ["bins", "histograms", "branches", "trees"] + for field in fields: + if field not in cfg["output"] or cfg["output"][field] is None: + cfg_output[field] = {} + else: + cfg_output[field] = cfg["output"][field] + + return cls(**cfg_output) + @load.register(str) + @load.register(Path) + @classmethod + def _load(cls, file): + cfg = read_yaml(file) + return cls.load(cfg) + + def _init_bins(self, bins_cfg): + self._bins = {} + self._nbins = {} + for name, arglist in bins_cfg.items(): + self._bins[name] = self._calculate_bins(*arglist) + self._nbins[name] = len(self._bins[name]) - 1 + + def _calculate_bins(self, binning, *bin_info): + match binning: + case 'logarithmic' | 'logarithm' | 'log' | 'lg': + binmin, binmax, nbins = bin_info + return logbins(binmin, binmax, nbins) + case 'linear' | 'line' | 'lin' | 'ln': + binmin, binmax, nbins = bin_info + return linbins(binmin, binmax, nbins) + case 'custom' | 'cst' | 'c': + return np.array(bin_info[0], dtype = float) + case _: + raise ValueError(f"Binning type {binning} invalid.") + + def _init_histograms(self, hists_cfg): + self.hists = {} + names = [] + # for now, doesn't support nested structures, but could in the future + for tag, cfg in hists_cfg.items(): + htype, name, title, *bin_names = cfg + self._validate_bin_names(bin_names) + # interlace number of bins and bin arrays for each axis + binnings = [val for name in bin_names for val in (self._nbins[name], self._bins[name])] + root_hist_cls = HISTOGRAM_REGISTRY[htype] + self.hists[tag] = root_hist_cls(name, title, *binnings) + names.append(name) + duplicates = {name: names.count(name) for name in names if names.count(name) > 1} + if duplicates: + self.logger.error(f"Multiple ROOT histograms defined with the same name, they will overwrite each other: {list(duplicates.keys())}") + + def _validate_bin_names(self, bin_names): + """Check bin names from Histogram configs are defined.""" + invalid_names = [name for name in bin_names if 
from enum import IntFlag, auto

from numpy import number  # numpy scalar base class (values arriving from arrays/YAML)


class Selection(IntFlag):
    """Base class for bitmask selections with YAML-friendly construction.

    Values may be built from ints, numpy integer scalars, a comma-separated
    string of member names (OR-ed together), or the special string "any"
    (all defined bits set).
    """

    @classmethod
    def max_value(cls):
        # All canonical bits set; aliases (composed members) do not count toward len(cls).
        return 2 ** len(cls) - 1

    def __invert__(self):
        # Complement restricted to the defined bit range (avoids negative ints).
        return self.__class__(self.__class__.max_value() - int(self))

    def __format__(self, format_spec):
        # f-strings show member names instead of the raw integer value.
        return str(self)

    @classmethod
    def create(cls, value):
        """Like cls(value), but passes None through (meaning: selection ignored)."""
        # can't put this inside _missing_ since _missing_ must return a Selection object
        if value is None:
            return None
        return cls(value)

    @classmethod
    def _missing_(cls, value):
        # Enum hook: called for values that are not canonical members.
        if isinstance(value, str):
            if value == "any":
                return ~cls(0)
            else:
                # comma-separated member names are OR-ed together
                res = cls(0)
                for sel in value.split(','):
                    res |= cls[sel.strip()]
                return res
        elif isinstance(value, number):
            # numpy scalar -> native Python int, then retry
            return cls(value.item())
        elif isinstance(value, int) and value >= 0:
            if value <= cls.max_value():
                return super()._missing_(value)
            else:
                raise ValueError(f"{cls.__name__}: {value} is out of range ({cls.max_value()}).")
        elif isinstance(value, cls):
            return value
        else:
            raise ValueError(f"{cls.__name__}: {value} is invalid. Valid selections: {cls._member_names_}")


# selections from O2Physics::daily-20260302-0000
class EventSel(Selection):
    sel8 = auto()
    sel7 = auto()
    selKINT7 = auto()
    selTVX = auto()
    selNoTimeFrameBorder = auto()
    selNoITSROFrameBorder = auto()
    selNoSameBunchPileup = auto()
    selIsGoodZvtxFT0vsPV = auto()
    selNoCollInTimeRangeStandard = auto()
    selNoCollInRofStandard = auto()
    selUpcSingleGapA = auto()
    selUpcSingleGapC = auto()
    selUpcDoubleGap = auto()
    # composite aliases for common selection combinations
    sel8Full = sel8 | selNoSameBunchPileup
    sel8FullPbPb = sel8 | selNoCollInTimeRangeStandard | selNoCollInRofStandard
    selUnanchoredMC = selTVX
    selMC = selTVX | selNoTimeFrameBorder
    selMCFull = selTVX | selNoTimeFrameBorder | selNoSameBunchPileup
    selMCFullPbPb = selTVX | selNoCollInTimeRangeStandard | selNoCollInRofStandard
    sel7KINT7 = sel7 | selKINT7


# selections from O2Physics::daily-20260302-0000
class TrigSel(Selection):
    noTrigSel = auto()
    JetChLowPt = auto()
    JetChHighPt = auto()
    TrackLowPt = auto()
    TrackHighPt = auto()
    JetD0ChLowPt = auto()
    JetD0ChHighPt = auto()
    JetLcChLowPt = auto()
    JetLcChHighPt = auto()
    EMCALReadout = auto()
    JetFullHighPt = auto()
    JetFullLowPt = auto()
    JetNeutralHighPt = auto()
    JetNeutralLowPt = auto()
    GammaVeryHighPtEMCAL = auto()
    GammaVeryHighPtDCAL = auto()
    GammaHighPtEMCAL = auto()
    GammaHighPtDCAL = auto()
    GammaLowPtEMCAL = auto()
    GammaLowPtDCAL = auto()
    GammaVeryLowPtEMCAL = auto()
    GammaVeryLowPtDCAL = auto()


# selections from O2Physics::daily-20260302-0000
class RCTSel(Selection):
    CPVBad = auto()
    EMCBad = auto()
    EMCLimAccMCRepr = auto()
    FDDBad = auto()
    FT0Bad = auto()
    FV0Bad = auto()
    HMPBad = auto()
    ITSBad = auto()
    ITSLimAccMCRepr = auto()
    MCHBad = auto()
    MCHLimAccMCRepr = auto()
    MFTBad = auto()
    MFTLimAccMCRepr = auto()
    MIDBad = auto()
    MIDLimAccMCRepr = auto()
    PHSBad = auto()
    TOFBad = auto()
    TOFLimAccMCRepr = auto()
    TPCBadTracking = auto()
    TPCBadPID = auto()
    TPCLimAccMCRepr = auto()
    TRDBad = auto()
    ZDCBad = auto()
    NRCTSelectionFlags = auto()
    # padding bits to keep subsequent flags at fixed positions
    Dummy24 = auto()
    Dummy25 = auto()
    Dummy26 = auto()
    Dummy27 = auto()
    Dummy28 = auto()
    Dummy29 = auto()
    Dummy30 = auto()
    CcdbObjectLoaded = auto()
    # composite aliases for common RCT quality combinations
    CBT = FT0Bad | ITSBad | TPCBadTracking | TPCBadPID
    CBT_LAMCRBad = CBT | ITSLimAccMCRepr | TPCLimAccMCRepr
    CBT_hadronPID = CBT | TOFBad
    CBT_hadronPID_LAMCRBad = CBT_hadronPID | ITSLimAccMCRepr | TPCLimAccMCRepr | TOFLimAccMCRepr
    CBT_calo = CBT | EMCBad
    CBT_calo_LAMCRBad = CBT_calo | ITSLimAccMCRepr | TPCLimAccMCRepr | EMCLimAccMCRepr


# selections from O2Physics::daily-20260302-0000
class TrackSel(Selection):
    trackSign = auto() # 1 = positive, 0 = negative
    globalTrack = auto()
    qualityTrack = auto()
    qualityTrackWDCA = auto()
    hybridTrack = auto()
    notBadMcTrack = auto()
    embeddedTrack = auto()
from functools import singledispatchmethod
from pathlib import Path

import numpy as np

import heppyy

from .logs import set_up_logger
from .selection import EventSel, RCTSel, TrackSel, TrigSel
from .utils import read_yaml

alian = heppyy.load_cppyy("alian")


class Selector:
    """Base class for YAML-configured selectors.

    Subclasses define `_types` (selection name -> allowed value types),
    `_name` (the sub-block under 'selections' in the YAML config), and
    `_dummy_mask` (an always-true RDataFrame filter string used when no
    selection is active).
    """
    _types = {}
    _name = ""
    _dummy_mask = ""

    def __init__(self, **selections):
        self.logger = set_up_logger(type(self).__name__)
        # one RDataFrame mask string (or None when inactive) per selection
        self._masks = {sel: None for sel in self._types.keys()}
        self.reset()
        self.update(**selections)

    @singledispatchmethod
    @classmethod
    def load(cls, file):
        """Build a selector from a YAML config dict or a path to a YAML file."""
        raise NotImplementedError(f'Cannot configure selector from type {type(file)}.')

    @load.register(dict)
    @classmethod
    def _load(cls, cfg):
        if "selections" not in cfg:
            raise KeyError("Selectors must be defined in a 'selections' block in the YAML configuration!")
        if cfg['selections'] is None:
            raise KeyError("The 'selections' block in the YAML configuration cannot be empty!")

        # An empty sub-block means "selector active, no cuts"; a missing
        # sub-block means "selector not configured" (returns None).
        if cls._name in cfg["selections"]:
            if cfg["selections"][cls._name] is None:
                return cls()
            else:
                return cls(**cfg["selections"][cls._name])
        else:
            return None

    @load.register(str)
    @load.register(Path)
    @classmethod
    def _load(cls, file):
        cfg = read_yaml(file)
        return cls.load(cfg)

    def reset(self):
        """Reset all selections."""
        for selection in self._types.keys():
            setattr(self, selection, None)

    def update(self, **selections):
        """Set given selections, leaving all other unspecified selections as-is."""
        valid_sels = self._validate(**selections)
        for name, val in valid_sels.items():
            setattr(self, name, val)

    def _is_ignore_selection(self, value):
        """Check if the selection (written as `value` in the YAML file) should be ignored."""
        return value in [None, "None", "none"]

    @property
    def rdf_mask(self):
        # AND together every active per-selection mask string.
        mask = " && ".join([mask for mask in self._masks.values() if mask is not None])
        if not mask: # if no selections active, pass dummy mask
            return self._dummy_mask
        return mask

    def apply_to(self, df):
        """Apply cuts to the event RDataFrame `df`."""
        raise NotImplementedError

    def _validate(self, **selections):
        """Validate the passed selections, and raise warnings if invalid selections were given."""
        # validate selection names
        unrecognized_selections = [sel for sel in selections.keys() if sel not in self._types.keys()]
        if unrecognized_selections:
            raise KeyError(f"Unrecognized selections were given: {', '.join(unrecognized_selections)}")
        # validate selection types
        recognized_selections = {sel: val for sel, val in selections.items() if sel in self._types.keys()}
        # check that each selection value is either the right type, or if is not, then it is an allowed ignore keyword
        wrong_types = [f"{sel}: {val} (type {type(val)}), must be {self._types[sel]}"
                       for sel, val in recognized_selections.items()
                       if not isinstance(val, self._types[sel]) and not self._is_ignore_selection(val) ]
        if wrong_types:
            wrong_types = "\n".join(wrong_types)
            raise TypeError(f"Incorrect types for the following selections were given. Use either the given types or 'none':\n{wrong_types}")
        return recognized_selections

    def _canonicalize(self, sel, val):
        # Map the YAML "ignore" spellings to None; otherwise enforce the declared type.
        if val in [None, "None", "none"]:
            return None
        if isinstance(val, self._types[sel]):
            return val
        raise TypeError(f"Incorrect type for {type(self).__name__}: type for {sel} must be {self._types[sel]}, not {type(val)}")

    def _pass_ignore(self, *args, **kwargs):
        # Stand-in predicate used when a selection is disabled: always passes.
        return True

    def _fail_ignore(self, *args, **kwargs):
        # Stand-in predicate: always fails.
        return False

    def selects(self, obj, verbose = False):
        # Subclasses implement the per-object selection decision.
        raise NotImplementedError

    def dump(self):
        """Dump all selections."""
        cfg = "\n".join([f"\t{sel}: {getattr(self, sel)}" for sel in self._types.keys()])
        self.logger.info(f"{type(self).__name__} configuration:\n{cfg}", stacklevel = 2)
+ cent_max (int/float): maximum centrality. + mult_min (int/float): minimum multiplicity. + mult_max (int/float): maximum multiplicity. + occupancy_min (int/float): minimum occupancy. + occupancy_max (int/float): maximum occupancy. + """ + + _types = { + "event_sel": (EventSel, str, int), + "trig_sel": (TrigSel, str, int), + "rct": (RCTSel, str, int), + "cent_min": (int, float), + "cent_max": (int, float), + "mult_min": (int, float), + "mult_max": (int, float), + "occupancy_min": (int, float), + "occupancy_max": (int, float), + } + _name = "event" + _dummy_mask = "event_sel >= 0" + + # special handling of no-selects to allow for ignoring the event or trigger selection altogether + @property + def event_sel(self): + return self._event_sel + @event_sel.setter + def event_sel(self, event_sel): + self._event_sel = EventSel.create(self._canonicalize('event_sel', event_sel)) + if self._event_sel is None: + self.pass_event_sel = self._pass_ignore + self._masks['event_sel'] = None + else: + self.pass_event_sel = self._pass_event_sel + self._masks['event_sel'] = f"(({int(self._event_sel)} & event_sel) == {int(self._event_sel)})" + @property + def trig_sel(self): + return self._trig_sel + @trig_sel.setter + def trig_sel(self, trig_sel): + self._trig_sel = TrigSel.create(self._canonicalize('trig_sel', trig_sel)) + if self._trig_sel is None: + self.pass_trig_sel = self._pass_ignore + self._masks['trig_sel'] = None + else: + self.pass_trig_sel = self._pass_trig_sel + self._masks['trig_sel'] = f"({int(self._trig_sel)} & trig_sel)" + @property + def rct(self): + return self._rct + @rct.setter + def rct(self, rct): + # NOTE: CCDB check and ZDC have to be included in RCT manually (combine with comma in config) + self._rct = RCTSel.create(self._canonicalize('rct', rct)) + if self._rct is None: + self.pass_rct = self._pass_ignore + self._masks['rct'] = None + else: + self.pass_rct = self._pass_rct + self._masks['rct'] = f"!({int(self._rct)} & rct)" + @property + def cent_min(self): + 
return self._cent_min + @cent_min.setter + def cent_min(self, cent_min): + self._cent_min = self._canonicalize('cent_min', cent_min) + if self._cent_min is None: + self.pass_cent_min = self._pass_ignore + self._masks['cent_min'] = None + else: + self.pass_cent_min = self._pass_cent_min + self._masks['cent_min'] = f"(centrality >= {self._cent_min})" + @property + def cent_max(self): + return self._cent_max + @cent_max.setter + def cent_max(self, cent_max): + self._cent_max = self._canonicalize('cent_max', cent_max) + if self._cent_max is None: + self.pass_cent_max = self._pass_ignore + self._masks['cent_max'] = None + else: + self.pass_cent_max = self._pass_cent_max + self._masks['cent_max'] = f"(centrality <= {self._cent_max})" + @property + def mult_min(self): + return self._mult_min + @mult_min.setter + def mult_min(self, mult_min): + self._mult_min = self._canonicalize('mult_min', mult_min) + if self._mult_min is None: + self.pass_mult_min = self._pass_ignore + self._masks['mult_min'] = None + else: + self.pass_mult_min = self._pass_mult_min + self._masks['mult_min'] = f"(multiplicity >= {self._mult_min})" + @property + def mult_max(self): + return self._mult_max + @mult_max.setter + def mult_max(self, mult_max): + self._mult_max = self._canonicalize('mult_max', mult_max) + if self._mult_max is None: + self.pass_mult_max = self._pass_ignore + self._masks['mult_max'] = None + else: + self.pass_mult_max = self._pass_mult_max + self._masks['mult_max'] = f"(multiplicity <= {self._mult_max})" + @property + def occupancy_min(self): + return self._occupancy_min + @occupancy_min.setter + def occupancy_min(self, occupancy_min): + self._occupancy_min = self._canonicalize('occupancy_min', occupancy_min) + if self._occupancy_min is None: + self.pass_occupancy_min = self._pass_ignore + self._masks['occupancy_min'] = None + else: + self.pass_occupancy_min = self._pass_occupancy_min + self._masks['occupancy_min'] = f"(occupancy >= {self._occupancy_min})" + @property + def 
occupancy_max(self): + return self._occupancy_max + @occupancy_max.setter + def occupancy_max(self, occupancy_max): + self._occupancy_max = self._canonicalize('occupancy_max', occupancy_max) + if self._occupancy_max is None: + self.pass_occupancy_max = self._pass_ignore + self._masks['occupancy_max'] = None + else: + self.pass_occupancy_max = self._pass_occupancy_max + self._masks['occupancy_max'] = f"(occupancy <= {self._occupancy_max})" + + def apply_to(self, df, label = "selected"): + """Apply event selections to the event RDataFrame.""" + self.logger.info(f"Applying event mask: {(mask := self.rdf_mask)}") + return df.Filter(mask) + + def _pass_event_sel(self, event): + """Check that the event satisfies ALL requested event selection bits.""" + return self._event_sel in event.event_sel + def _pass_trig_sel(self, event): + """Check that the event satisfies ANY requested trigger bit.""" + return bool(self._trig_sel & event.trig_sel) + def _pass_rct(self, event): + """Check that the event is not Bad for ANY relevant detectors.""" + return not bool(self._rct & event.rct) + def _pass_cent_min(self, event): + return event.centrality >= self._cent_min + def _pass_cent_max(self, event): + return event.centrality <= self._cent_max + def _pass_mult_min(self, event): + return event.multiplicity >= self._mult_min + def _pass_mult_max(self, event): + return event.multiplicity <= self._mult_max + def _pass_occupancy_min(self, event): + return event.occupancy >= self._occupancy_min + def _pass_occupancy_max(self, event): + return event.occupancy <= self._occupancy_max + def selects(self, event): + """Determine if event passes all event-level selections: event selection, trigger selection, RCT, centrality, multiplicity, occupancy.""" + return self.pass_event_sel(event) and self.pass_trig_sel(event) and self.pass_rct(event) \ + and self.pass_cent_min(event) and self.pass_cent_max(event) \ + and self.pass_mult_min(event) and self.pass_mult_max(event) \ + and 
class TrackSelector(Selector):
    """A track selector, configured via YAML.

    Holds every track-level cut. Setting a cut to None disables it.
    Several `track_sel` values may be combined with commas in the config.

    Attributes:
        pt_min (int/float): minimum track transverse momentum.
        track_sel (TrackSel): track selection to apply; a track passes if it
            satisfies AT LEAST ONE of the requested selections.
    """

    _types = {
        "pt_min": (float, int),
        "track_sel": (TrackSel, str, int)
    }
    _name = "track"
    _dummy_mask = "track_pt > -1"

    @property
    def pt_min(self):
        return self._pt_min

    @pt_min.setter
    def pt_min(self, value):
        self._pt_min = self._canonicalize('pt_min', value)
        if self._pt_min is not None:
            self.pass_pt_min = self._pass_pt_min
            self._masks['pt_min'] = f"(track_pt >= {self._pt_min})"
        else:
            self.pass_pt_min = self._pass_ignore
            self._masks['pt_min'] = None

    @property
    def track_sel(self):
        return self._track_sel

    @track_sel.setter
    def track_sel(self, value):
        self._track_sel = TrackSel.create(self._canonicalize('track_sel', value))
        if self._track_sel is not None:
            self.pass_track_sel = self._pass_track_sel
            self._masks['track_sel'] = f"(track_sel & {int(self._track_sel)})"
        else:
            self.pass_track_sel = self._pass_ignore
            self._masks['track_sel'] = None

    def _pass_pt_min(self, track):
        """Check that the track carries at least the minimum required pT."""
        return track.pt() >= self._pt_min

    def _pass_track_sel(self, track):
        """Check that the track satisfies ANY requested track selection bit."""
        return bool(self._track_sel & track.user_info[alian.TrackInfo]().track_sel())

    def apply_to(self, df, label = "selected"):
        """Apply selections to RDataFrame df."""
        self.logger.info(f"Applying track mask: {(mask := self.rdf_mask)}")
        ret = df.Define("track_mask", mask)
        # create masked copies of every existing track_* column
        renamed = {col: f"{col}_{label}" for col in df.GetColumnNames() if col.startswith("track_")}
        for src_col, dst_col in renamed.items():
            ret = ret.Define(dst_col, f"{src_col}[track_mask]")
        return ret

    def selects(self, track):
        """Determine if track passes the track selection."""
        return self.pass_track_sel(track) and self.pass_pt_min(track)

    def __call__(self, track):
        return self.selects(track)
class ClusterSelector(Selector):
    """A cluster selector, configured via YAML.

    The ClusterSelector contains all cluster-level selections. To ignore a particular selection,
    set the relevant selection to None. Note that, due to the BerkeleyTree structure, cuts related
    to matched tracks cannot be applied to RDataFrames.

    Attributes:
        energy_min (int/float): minimum cluster energy.
        pt_min (int/float): minimum cluster transverse momentum.
        m02_min (int/float): minimum cluster M02.
        m02_max (int/float): maximum cluster M02.
        ncells_min (int/float): minimum number of cells in the cluster. Cluster must have ncells greater than OR EQUAL TO this value.
        time_min (int/float): minimum cluster time.
        time_max (int/float): maximum cluster time.
        exoticity (bool): exoticity selection. To select non-exotic clusters, set to True. To select exotic clusters, set to False. To ignore exoticity entirely, set to None.
        dbc (int/float): minimum distance to the nearest bad EMC channel. Note that the JE derived data currently fills this with a dummy value of 1024.
        nlm (int): maximum number of local maxima. Cluster must have an NLM less than OR EQUAL TO this value.
        defn (int): filter on a cluster definition. Set to 0 for V1, 10 for V3. Multiple definitions cannot be filtered on simultaneously; if needed, set to None and check manually on analysis level.
        edge (bool): apply the SM edge cut.
        delta_eta (int/float): maximum difference in eta (at EMC) between cluster and track to match.
        delta_phi (int/float): maximum difference in phi (at EMC) between cluster and track to match.
        ep_max (int/float): maximum ratio of cluster energy to track momentum to match.
    """

    w = 0.0143  # cell granularity in eta/phi
    # supermodule phi boundaries (radians)
    sm_full = (81.2 * np.pi / 180, np.pi)
    sm_23 = (261.2 * np.pi / 180, 318.8 * np.pi / 180)
    sm_13e = (181.2 * np.pi / 180, 185.8 * np.pi / 180)
    sm_13d = (321.2 * np.pi / 180, 325.8 * np.pi / 180)

    _types = {
        # cluster cuts
        "energy_min": (float, int),
        "pt_min": (float, int),
        "m02_min": (float, int),
        "m02_max": (float, int),
        "ncells_min": (float, int),
        "time_min": (float, int),
        "time_max": (float, int),
        "exoticity": bool,
        "dbc": (float, int),
        "nlm": int,
        "defn": int,
        "edge": bool,
        "delta_eta": (float, int),
        "delta_phi": (float, int),
        "ep_max": (float, int),
    }
    _name = "cluster"
    _dummy_mask = "cluster_pt > -1"

    @property
    def energy_min(self):
        return self._energy_min
    @energy_min.setter
    def energy_min(self, energy_min):
        self._energy_min = self._canonicalize('energy_min', energy_min)
        if self._energy_min is None:
            self.pass_energy_min = self._pass_ignore
            self._masks['energy_min'] = None
        else:
            self.pass_energy_min = self._pass_energy_min
            self._masks['energy_min'] = f"(cluster_energy >= {self._energy_min})"
    @property
    def pt_min(self):
        return self._pt_min
    @pt_min.setter
    def pt_min(self, pt_min):
        self._pt_min = self._canonicalize('pt_min', pt_min)
        if self._pt_min is None:
            self.pass_pt_min = self._pass_ignore
            self._masks['pt_min'] = None
        else:
            self.pass_pt_min = self._pass_pt_min
            self._masks['pt_min'] = f"(cluster_pt >= {self._pt_min})"
    @property
    def m02_min(self):
        return self._m02_min
    @m02_min.setter
    def m02_min(self, m02_min):
        self._m02_min = self._canonicalize('m02_min', m02_min)
        # BUG FIX: m02_min and m02_max previously shared the single mask key
        # 'm02', so configuring both cuts silently dropped one of them from
        # the RDF mask. They now use distinct keys.
        if self._m02_min is None:
            self.pass_m02_min = self._pass_ignore
            self._masks['m02_min'] = None
        else:
            self.pass_m02_min = self._pass_m02_min
            self._masks['m02_min'] = f"(cluster_m02 >= {self._m02_min})"
    @property
    def m02_max(self):
        return self._m02_max
    @m02_max.setter
    def m02_max(self, m02_max):
        self._m02_max = self._canonicalize('m02_max', m02_max)
        if self._m02_max is None:
            self.pass_m02_max = self._pass_ignore
            self._masks['m02_max'] = None
        else:
            self.pass_m02_max = self._pass_m02_max
            self._masks['m02_max'] = f"(cluster_m02 <= {self._m02_max})"
    @property
    def ncells_min(self):
        return self._ncells_min
    @ncells_min.setter
    def ncells_min(self, ncells_min):
        self._ncells_min = self._canonicalize('ncells_min', ncells_min)
        if self._ncells_min is None:
            self.pass_ncells_min = self._pass_ignore
            self._masks['ncells_min'] = None
        else:
            self.pass_ncells_min = self._pass_ncells_min
            self._masks['ncells_min'] = f"(cluster_ncells >= {self._ncells_min})"
    @property
    def time_min(self):
        return self._time_min
    @time_min.setter
    def time_min(self, time_min):
        self._time_min = self._canonicalize('time_min', time_min)
        if self._time_min is None:
            self.pass_time_min = self._pass_ignore
            self._masks['time_min'] = None
        else:
            self.pass_time_min = self._pass_time_min
            self._masks['time_min'] = f"(cluster_time >= {self._time_min})"
    @property
    def time_max(self):
        return self._time_max
    @time_max.setter
    def time_max(self, time_max):
        self._time_max = self._canonicalize('time_max', time_max)
        if self._time_max is None:
            self.pass_time_max = self._pass_ignore
            self._masks['time_max'] = None
        else:
            self.pass_time_max = self._pass_time_max
            self._masks['time_max'] = f"(cluster_time <= {self._time_max})"
    @property
    def exoticity(self):
        return self._exoticity
    @exoticity.setter
    def exoticity(self, exoticity):
        self._exoticity = self._canonicalize('exoticity', exoticity)
        if self._exoticity is None:
            self.pass_exotic = self._pass_ignore
            self._masks['exoticity'] = None
        elif self._exoticity is True:
            self.pass_exotic = self._pass_exotic
            self._masks['exoticity'] = "(!cluster_exoticity)"
        elif self._exoticity is False:
            self.logger.warning("The exoticity cut has been set to False, which will yield exotic clusters only. If you wanted to remove the exoticity check entirely, set this to 'none'.")
            self.pass_exotic = self._fail_exotic
            self._masks['exoticity'] = "(cluster_exoticity)"
        else:
            raise RuntimeError("Somehow the type checking was bypassed here...")
    @property
    def dbc(self):
        return self._dbc
    @dbc.setter
    def dbc(self, dbc):
        self._dbc = self._canonicalize('dbc', dbc)
        if self._dbc is None:
            self.pass_dbc = self._pass_ignore
            self._masks['dbc'] = None
        else:
            self.pass_dbc = self._pass_dbc
            self._masks['dbc'] = f"(cluster_dbc >= {self._dbc})"
    @property
    def nlm(self):
        return self._nlm
    @nlm.setter
    def nlm(self, nlm):
        self._nlm = self._canonicalize('nlm', nlm)
        if self._nlm is None:
            self.pass_nlm = self._pass_ignore
            self._masks['nlm'] = None
        else:
            self.pass_nlm = self._pass_nlm
            # NOTE: this is an upper bound (<=), matching _pass_nlm
            self._masks['nlm'] = f"(cluster_nlm <= {self._nlm})"
    @property
    def defn(self):
        return self._defn
    @defn.setter
    def defn(self, defn):
        self._defn = self._canonicalize('defn', defn)
        if self._defn is None:
            self.pass_defn = self._pass_ignore
            self._masks['defn'] = None
        else:
            self.pass_defn = self._pass_defn
            self._masks['defn'] = f"(cluster_defn == {self._defn})"
    @property
    def edge(self):
        return self._edge
    @edge.setter
    def edge(self, edge):
        self._edge = self._canonicalize('edge', edge)
        if not self._edge:  # either False or None
            self.pass_edge = self._pass_ignore
            self._masks['edge'] = None
        else:
            self.pass_edge = self._pass_edge
            self._masks['edge'] = ("( (cluster_eta_abs < 0.67 && ("
                                   f"(cluster_phi > {self.sm_full[0]} && cluster_phi < {self.sm_full[1]}) || "
                                   f"(cluster_phi > {self.sm_13e[0]} && cluster_phi < {self.sm_13e[1]}) || "
                                   f"(cluster_phi > {self.sm_13d[0]} && cluster_phi < {self.sm_13d[1]}) )) "
                                   f"|| (cluster_eta_abs > 0.25 && cluster_eta_abs < 0.67 && cluster_phi > {self.sm_23[0]} && cluster_phi < {self.sm_23[1]}) )"
                                   )
    @property
    def delta_eta(self):
        return self._delta_eta
    @delta_eta.setter
    def delta_eta(self, delta_eta):
        self._delta_eta = self._canonicalize('delta_eta', delta_eta)
        if self._delta_eta is None:
            # NOTE: no way to use a mask to properly cut on matched track info
            self.pass_delta_eta = self._fail_ignore
        else:
            self.pass_delta_eta = self._pass_delta_eta
    @property
    def delta_phi(self):
        return self._delta_phi
    @delta_phi.setter
    def delta_phi(self, delta_phi):
        self._delta_phi = self._canonicalize('delta_phi', delta_phi)
        if self._delta_phi is None:
            # NOTE: no way to use a mask to properly cut on matched track info
            self.pass_delta_phi = self._fail_ignore
        else:
            self.pass_delta_phi = self._pass_delta_phi
    @property
    def ep_max(self):
        return self._ep_max
    @ep_max.setter
    def ep_max(self, ep_max):
        self._ep_max = self._canonicalize('ep_max', ep_max)
        if self._ep_max is None:
            # NOTE: no way to use a mask to properly cut on matched track info
            self.pass_ep_max = self._fail_ignore
        else:
            self.pass_ep_max = self._pass_ep_max

    def apply_to(self, df, label = "selected"):
        """Apply cluster selections to RDataFrame df (matched-track cuts excluded)."""
        self.logger.warning("Cuts related to matched tracks cannot be properly applied to RDFs, so any cuts specified will be ignored.")
        if 'cluster_pt' not in df.GetColumnNames():
            df_masked = df.Define("cluster_pt", "cluster_energy / cosh(cluster_eta)")
        else:
            df_masked = df
        df_masked = df_masked.Define("cluster_eta_abs", "abs(cluster_eta)")
        mask = self.rdf_mask
        self.logger.info(f"Applying cluster mask: {(mask)}")
        df_masked = df_masked.Define("cluster_mask", mask)
        cluster_cols = [col for col in df_masked.GetColumnNames() if col.startswith("cluster_")]
        name_map = {col: f"{col}_{label}" for col in cluster_cols}
        for colname, masked_colname in name_map.items():
            df_masked = df_masked.Define(masked_colname, f"{colname}[cluster_mask]")
        return df_masked

    def _pass_energy_min(self, cluster):
        """Checks if the cluster has enough energy."""
        return cluster.energy() >= self._energy_min
    def _pass_pt_min(self, cluster):
        """Checks if the cluster has large enough pT."""
        return cluster.pt() >= self._pt_min
    def _pass_m02_min(self, cluster):
        """Checks if the cluster M02 is above the minimum."""
        return cluster.m02() >= self._m02_min
    def _pass_m02_max(self, cluster):
        """Checks if the cluster M02 is below the maximum."""
        return cluster.m02() <= self._m02_max
    def _pass_ncells_min(self, cluster):
        """Checks if the cluster has at least the minimum number of cells."""
        return cluster.ncells() >= self._ncells_min
    def _pass_time_min(self, cluster):
        """Checks if the cluster time is above the minimum."""
        return cluster.time() >= self._time_min
    def _pass_time_max(self, cluster):
        """Checks if the cluster time is below the maximum."""
        return cluster.time() <= self._time_max
    # Special handling for the exoticity cut, which is a boolean. If exoticity is
    # set to True, then the selector will select non-exotic clusters, and vice versa.
    # If set to None, then the cluster exoticity is ignored during selection.
    def _pass_exotic(self, cluster):
        """Checks if the cluster passes the exoticity cut (i.e. it is non-exotic). Used if exoticity = True."""
        return not cluster.exoticity()
    def _fail_exotic(self, cluster):
        """Checks if the cluster does not pass the exoticity cut (i.e. it is exotic). Used if exoticity = False."""
        return cluster.exoticity()
    def _pass_dbc(self, cluster):
        """Checks if the cluster is far enough away from a bad EMC channel."""
        return cluster.dbc() >= self._dbc
    def _pass_nlm(self, cluster):
        """Checks if the cluster has few enough local maxima."""
        return cluster.nlm() <= self._nlm
    def _pass_defn(self, cluster):
        """Checks the cluster definition."""
        return cluster.defn() == self._defn
    def _pass_edge(self, cluster):
        # TODO: check if this cut is done right
        """Checks if the cluster is far enough away from a SM edge."""
        abs_eta = abs(cluster.eta())
        phi = cluster.phi()
        return (abs_eta < 0.67 and ( \
            (phi > self.sm_full[0] and phi < self.sm_full[1]) or \
            (phi > self.sm_13e[0] and phi < self.sm_13e[1]) or \
            (phi > self.sm_13d[0] and phi < self.sm_13d[1]) )) \
            or (abs_eta > 0.25 and abs_eta < 0.67 and phi > self.sm_23[0] and phi < self.sm_23[1])
    def pass_cluster_cuts(self, cluster):
        """Apply all cluster-level cuts (everything except track matching)."""
        return self.pass_energy_min(cluster) and self.pass_pt_min(cluster) \
            and self.pass_m02_min(cluster) and self.pass_m02_max(cluster) \
            and self.pass_ncells_min(cluster) and self.pass_time_min(cluster) \
            and self.pass_time_max(cluster) and self.pass_exotic(cluster) \
            and self.pass_dbc(cluster) and self.pass_nlm(cluster) \
            and self.pass_defn(cluster) and self.pass_edge(cluster)

    def _pass_delta_eta(self, cluster, idx):
        """Checks if a cluster and the given matched track match in eta."""
        return abs(cluster.matchedTrackDeltaEta()[idx]) <= self._delta_eta
    def _pass_delta_phi(self, cluster, idx):
        """Checks if a cluster and the given matched track match in phi."""
        return abs(cluster.matchedTrackDeltaPhi()[idx]) <= self._delta_phi
    def _pass_ep_max(self, cluster, idx):
        """Checks if a cluster and track match in energy."""
        return cluster.energy() / cluster.matchedTrackP()[idx] <= self._ep_max
    def is_not_in_sector_edge(self, cluster, idx):
        # TODO: implement TPC sector edge cut from LF
        return True
    def is_matched_to_track(self, cluster, idx):
        """Checks if a cluster and the given matched track match."""
        return self.pass_delta_eta(cluster, idx) and self.pass_delta_phi(cluster, idx) \
            and self.pass_ep_max(cluster, idx) and self.is_not_in_sector_edge(cluster, idx)
    def is_matched(self, cluster):
        """Checks if the cluster is matched to any of its candidate tracks."""
        return any(self.is_matched_to_track(cluster, idx) for idx in range(cluster.matchedTrackN()))
    def selects(self, cluster):
        """Determine if cluster is a photon candidate and is not matched to any track."""
        return self.pass_cluster_cuts(cluster) and not self.is_matched(cluster)
    def __call__(self, cluster):
        return self.selects(cluster)
class IsolationSelector(Selector):
    """A cluster selector for isolation, configured via YAML.

    Holds the isolation criteria for EMCal clusters. To ignore the isolation pT
    altogether, set it to None or leave it unspecified in the config. Unlike
    other selections, `iso_cone_R` and `track_eta_max` fall back to default
    values of 0.4 and 0.9 when None or unspecified.

    Attributes:
        iso_cone_R (float/int): isolation cone radius. Defaults to 0.4 when unspecified.
        iso_pt_max (float/int): maximum isolation transverse momentum to be considered isolated.
        track_eta_max (float/int): maximum track pseudorapidity; only used for the
            geometric acceptance correction. Defaults to 0.9 when unspecified.
    """

    _types = {
        # isolation cuts
        "iso_cone_R": (float, int),    # isocone radius
        "iso_pt_max": (float, int),    # max isocone pT (GeV)
        "track_eta_max": (float, int), # maximum track pseudorapidity
    }
    _name = "isolation"

    @property
    def iso_pt_max(self):
        return self._iso_pt_max

    @iso_pt_max.setter
    def iso_pt_max(self, value):
        self._iso_pt_max = self._canonicalize('iso_pt_max', value)
        # The isolation pT is the only cut this selector checks, so selects()
        # is bound directly here.
        self.selects = self._pass_ignore if self._iso_pt_max is None else self._pass_iso_pt_max

    @property
    def iso_cone_R(self):
        return self._iso_cone_R

    @iso_cone_R.setter
    def iso_cone_R(self, value):
        self._iso_cone_R = self._canonicalize('iso_cone_R', value)
        if self._iso_cone_R is None:  # None value not allowed in this case, so set to 0.4
            self.logger.warning("Isolation cone radius unspecified; setting to 0.4.")
            self._iso_cone_R = 0.4

    @property
    def track_eta_max(self):
        return self._track_eta_max

    @track_eta_max.setter
    def track_eta_max(self, value):
        self._track_eta_max = self._canonicalize('track_eta_max', value)
        if self._track_eta_max is None:  # None value not allowed in this case, so set to 0.9
            self.logger.warning("Maximum track eta unspecified; setting to 0.9.")
            self._track_eta_max = 0.9

    def _pass_iso_pt_max(self, cluster, tracks, verbose = False):
        """Determine if cluster is isolated."""
        cone_pt = np.sum([t.pt() for t in tracks if cluster.dR(t) < self._iso_cone_R])
        facc = self._facc(cluster.eta())
        cone_pt_eff = cone_pt * facc
        isolated = cone_pt_eff < self._iso_pt_max
        if verbose:
            return isolated, cone_pt_eff, facc, cone_pt
        return isolated

    def apply_to(self, df):
        raise NotImplementedError("Isolation criteria can't be applied to RDataFrames!")

    def _facc(self, eta):
        """Calculate the geometric acceptance correction for clusters whose
        isolation cones extend outside the central barrel acceptance."""
        eta = abs(eta)
        R = self._iso_cone_R
        d = self._track_eta_max - eta
        if eta < (self._track_eta_max - R):
            return 1.0
        return np.pi * R * R / (d * np.sqrt(R**2 - d**2) + np.pi * R**2 * (1 - 1 / np.pi * np.arccos(d / R)))
class AnalysisSelector:
    """Bundle of the event/track/cluster/isolation selectors for one analysis."""

    # the selector slots this container manages, in reporting order
    _selector_names = ("event", "track", "cluster", "isolation")

    def __init__(self, file):
        self.logger = set_up_logger(type(self).__name__)
        self.event = EventSelector.load(file)
        self.track = TrackSelector.load(file)
        self.cluster = ClusterSelector.load(file)
        self.isolation = IsolationSelector.load(file)

    @classmethod
    def load(cls, file):
        """Alternate constructor mirroring the individual selectors' load() API."""
        return cls(file)

    def dump(self):
        """Log the configuration of every active selector."""
        self.logger.info("Selector configurations:")
        for name in self.active:
            getattr(self, name).dump()
        self.logger.info(f"Inactive selectors: {self.inactive}")

    @property
    def active(self):
        """Return a list of active selectors."""
        return [name for name in self._selector_names if self.is_active(name)]

    @property
    def inactive(self):
        """Return a list of inactive selectors."""
        return [name for name in self._selector_names if self.is_inactive(name)]

    def is_active(self, selector: str):
        return getattr(self, selector) is not None

    def is_inactive(self, selector: str):
        return getattr(self, selector) is None
def dict_to_ndict(d):
    """Recursively convert a plain dict into a nested defaultdict ("ndict")."""
    # named `out` (not `ndict`) so the module-level `ndict` alias is not shadowed
    out = nested_dict()
    for k, v in d.items():
        out[k] = dict_to_ndict(v) if isinstance(v, dict) else v
    return out

def ndict_to_dict(d):
    """Recursively convert a nested defaultdict back into plain dicts (mutates d)."""
    for k, v in d.items():
        if isinstance(v, dict):
            d[k] = ndict_to_dict(v)
    return dict(d)

def linbins(xmin, xmax, nbins):
    """Return nbins+1 linearly spaced bin edges between xmin and xmax."""
    return np.linspace(xmin, xmax, nbins + 1)

def logbins(xmin, xmax, nbins):
    """Return nbins+1 logarithmically spaced bin edges between xmin and xmax."""
    return np.logspace(np.log10(xmin), np.log10(xmax), nbins + 1)

def delta_R(p1, p2):
    """Calculate angular distance with pseudorapidity (rather than rapidity)."""
    return np.sqrt(p1.delta_phi_to(p2) ** 2 + (p1.eta() - p2.eta()) ** 2)

def delta_phi_limit(dphi):
    """Limit a given angle to the range [-pi, pi]."""
    while dphi > np.pi:
        dphi -= 2 * np.pi
    while dphi < -np.pi:
        dphi += 2 * np.pi
    return dphi

def is_slurm():
    """Check if we are in a Slurm job / on a compute node."""
    if "SLURM_SUBMIT_DIR" in os.environ or 'HOST' not in os.environ:
        # Slurm environment variables present, or HOST unset (typical of batch nodes)
        return True
    elif 'login' in os.environ['HOST']:
        # interactive login node
        return False
    else:
        # if unparsable, default to Slurm configuration (safer)
        return True
multiplicity - - track_data_eta - - track_data_phi - - track_data_pt - - track_data_label - - track_data_tracksel -# - cluster_data_energy -# - cluster_data_eta -# - cluster_data_phi -# - cluster_data_m02 -# - cluster_data_m20 -# - cluster_data_ncells -# - cluster_data_time -# - cluster_data_isexotic -# - cluster_data_distancebadchannel -# - cluster_data_nlm -# - cluster_data_clusterdef -# - cluster_data_matchedTrackIndex + - centrality + - occupancy + - event_sel + - trig_sel + - rct + - track_pt + - track_eta + - track_phi + - track_sel + # - cluster_energy + # - cluster_eta + # - cluster_phi + # - cluster_m02 + # - cluster_m20 + # - cluster_ncells + # - cluster_time + # - cluster_exoticity + # - cluster_dbc + # - cluster_nlm + # - cluster_defn + # - cluster_matched_track_n + # - cluster_matched_track_delta_eta + # - cluster_matched_track_delta_phi + # - cluster_matched_track_p + # - cluster_matched_track_pt + # - cluster_matched_track_sel diff --git a/alian/io/data_io.py b/alian/io/data_io.py index a943cbe..cbbdf0e 100644 --- a/alian/io/data_io.py +++ b/alian/io/data_io.py @@ -34,7 +34,7 @@ def __init__(self, file_list, yaml_file=None, tree_name=None, branches=None, **k def add_generic_ebye_info(self): self.event.multiplicity = self.event.data['multiplicity'] self.event.centrality = self.event.data['centrality'] - self.event.track_count = len(self.event.data['track_data_pt']) + self.event.track_count = len(self.event.data['track_pt']) self.event.counter = self.event_count # Efficiently iterate over the tree as a generator @@ -175,7 +175,15 @@ def next_event(self): break -def get_default_tree_structure(lhc_run=3): +def get_default_tree_structure(file_list, lhc_run=3): + if isinstance(file_list, str): + if file_list.endswith(".txt"): + with open(file_list, "r") as file: + filename = file.read().splitlines() + else: + filename = file_list + if os.path.isfile(_guess_path := os.path.join(os.path.dirname(os.path.abspath(filename)), "../tstruct.yaml")): + return 
_guess_path _alian_path_guess = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../') _default_tree_structure = os.path.join(_alian_path_guess, f'config/run{lhc_run}_tstruct.yaml') _alian_path = yasp.features('prefix', 'alian') @@ -195,7 +203,7 @@ def __init__(self, file_list, lhc_run=None, yaml_file=None, tree_name=None, bran self.tree_name = tree_name self.branches = branches if self.yaml_file is None: - self.yaml_file = get_default_tree_structure(lhc_run=self.lhc_run) + self.yaml_file = get_default_tree_structure(file_list = file_list, lhc_run=self.lhc_run) _data_input = Run2FileInput if lhc_run == 2 else Run3FileInput self.data_input = _data_input(file_list, yaml_file=self.yaml_file, tree_name=self.tree_name, branches=self.branches, **kwargs) diff --git a/alian/src/fjutil/fjutil.cxx b/alian/src/fjutil/fjutil.cxx index 5933e82..e3c8795 100644 --- a/alian/src/fjutil/fjutil.cxx +++ b/alian/src/fjutil/fjutil.cxx @@ -1,7 +1,7 @@ // fjutil.cxx #include "fjutil.hh" #include - +#include namespace alian { void process_numpy_array(PyObject *array) @@ -102,7 +102,288 @@ namespace alian particles.emplace_back(px, py, pz, E); particles.back().set_user_index(i + index_offset); } - return particles; } + + + // function to transform numpy arrays (px, py, pz, tracksel) into a vector of PseudoJets with track information + std::vector numpy_pxpypz_to_tracks(PyObject *px, PyObject *py, PyObject *pz, PyObject *tracksel, int index_offset) + { + PyArrayObject *np_px = reinterpret_cast(px); + PyArrayObject *np_py = reinterpret_cast(py); + PyArrayObject *np_pz = reinterpret_cast(pz); + PyArrayObject *np_tracksel = reinterpret_cast(tracksel); + + if (PyArray_NDIM(np_px) != 1 || PyArray_NDIM(np_py) != 1 || PyArray_NDIM(np_pz) != 1 || PyArray_NDIM(np_tracksel) != 1) + { + PyErr_SetString(PyExc_TypeError, "Arrays must be one-dimensional"); + return std::vector(); + } + + npy_intp size = PyArray_SIZE(np_px); + if (size != PyArray_SIZE(np_py) || size != PyArray_SIZE(np_pz) || size 
!= PyArray_SIZE(np_tracksel)) + { + PyErr_SetString(PyExc_ValueError, "Arrays must have the same size"); + return std::vector(); + } + + float *px_data = static_cast(PyArray_DATA(np_px)); + float *py_data = static_cast(PyArray_DATA(np_py)); + float *pz_data = static_cast(PyArray_DATA(np_pz)); + uint8_t *tracksel_data = static_cast(PyArray_DATA(np_tracksel)); + + std::vector tracks; + tracks.reserve(size); + + for (npy_intp i = 0; i < size; ++i) + { + // assume the charged pion mass + double E = sqrt(px_data[i] * px_data[i] + py_data[i] * py_data[i] + pz_data[i] * pz_data[i] + PION_MASS * PION_MASS); + // charge is stored in the 0th bit of tracksel (1 = positive, 0 = negativ) + short q = tracksel_data[i] & 0b1 ? 1 : -1; + + tracks.emplace_back(px_data[i], py_data[i], pz_data[i], E); + tracks.back().set_user_info(new alian::TrackInfo(q, tracksel_data[i])); + tracks.back().set_user_index(i + index_offset); + } + + return tracks; + } + + // function to transform numpy arrays (pT, eta, phi, tracksel) into a vector of PseudoJets with track information + std::vector numpy_ptetaphi_to_tracks(PyObject *pt, PyObject *eta, PyObject *phi, PyObject *tracksel, int index_offset) + { + PyArrayObject *np_pt = reinterpret_cast(pt); + PyArrayObject *np_eta = reinterpret_cast(eta); + PyArrayObject *np_phi = reinterpret_cast(phi); + PyArrayObject *np_tracksel = reinterpret_cast(tracksel); + + if (PyArray_NDIM(np_pt) != 1 || PyArray_NDIM(np_eta) != 1 || PyArray_NDIM(np_phi) != 1 || PyArray_NDIM(np_tracksel) != 1) + { + PyErr_SetString(PyExc_TypeError, "Arrays must be one-dimensional"); + return std::vector(); + } + + npy_intp size = PyArray_SIZE(np_pt); + if (size != PyArray_SIZE(np_eta) || size != PyArray_SIZE(np_phi) || size != PyArray_SIZE(np_tracksel)) + { + PyErr_SetString(PyExc_ValueError, "Arrays must have the same size"); + return std::vector(); + } + + float *pt_data = static_cast(PyArray_DATA(np_pt)); + float *eta_data = static_cast(PyArray_DATA(np_eta)); + float *phi_data = 
static_cast(PyArray_DATA(np_phi)); + uint8_t *tracksel_data = static_cast(PyArray_DATA(np_tracksel)); + + std::vector tracks; + tracks.reserve(size); + + for (npy_intp i = 0; i < size; ++i) + { + if (pt_data[i] <= 0.001) // this is 1 MeV (!) + { + // PyErr_SetString(PyExc_ValueError, "pt must be positive"); + return std::vector(); + continue; + } + double px = pt_data[i] * cos(phi_data[i]); + double py = pt_data[i] * sin(phi_data[i]); + double pz = pt_data[i] * sinh(eta_data[i]); + // assume the pion mass + double E = sqrt(px * px + py * py + pz * pz + PION_MASS * PION_MASS); + // charge is stored in the 0th bit of tracksel (1 = positive, 0 = negative) + short q = tracksel_data[i] & 0b1 ? 1 : -1; + + tracks.emplace_back(px, py, pz, E); + tracks.back().set_user_info(new alian::TrackInfo(q, tracksel_data[i])); + tracks.back().set_user_index(i + index_offset); + } + + return tracks; + } + + std::vector numpy_energyetaphi_to_clusters( + PyObject *energy, PyObject *eta, PyObject *phi, + PyObject *m02, + PyObject *m20, + PyObject *ncells, + PyObject *time, + PyObject *exoticity, + PyObject *dbc, + PyObject *nlm, + PyObject *defn, + PyObject *matchedTrackN, + PyObject *matchedTrackDeltaEta, + PyObject *matchedTrackDeltaPhi, + PyObject *matchedTrackP, + PyObject *matchedTrackPt, + PyObject *matchedTrackSel, + int index_offset + ) + { + PyArrayObject *np_energy = reinterpret_cast(energy); + PyArrayObject *np_eta = reinterpret_cast(eta); + PyArrayObject *np_phi = reinterpret_cast(phi); + PyArrayObject *np_m02 = reinterpret_cast(m02); + PyArrayObject *np_m20 = reinterpret_cast(m20); + PyArrayObject *np_ncells = reinterpret_cast(ncells); + PyArrayObject *np_time = reinterpret_cast(time); + PyArrayObject *np_exoticity = reinterpret_cast(exoticity); + PyArrayObject *np_dbc = reinterpret_cast(dbc); + PyArrayObject *np_nlm = reinterpret_cast(nlm); + PyArrayObject *np_defn = reinterpret_cast(defn); + PyArrayObject *np_matchedTrackN = reinterpret_cast(matchedTrackN); + PyArrayObject 
*np_matchedTrackDeltaEta = reinterpret_cast(matchedTrackDeltaEta); + PyArrayObject *np_matchedTrackDeltaPhi = reinterpret_cast(matchedTrackDeltaPhi); + PyArrayObject *np_matchedTrackP = reinterpret_cast(matchedTrackP); + PyArrayObject *np_matchedTrackPt = reinterpret_cast(matchedTrackPt); + PyArrayObject *np_matchedTrackSel = reinterpret_cast(matchedTrackSel); + + if (PyArray_NDIM(np_energy) != 1 + || PyArray_NDIM(np_eta) != 1 + || PyArray_NDIM(np_phi) != 1 + || PyArray_NDIM(np_m02) != 1 + || PyArray_NDIM(np_m02) != 1 + || PyArray_NDIM(np_m20) != 1 + || PyArray_NDIM(np_ncells) != 1 + || PyArray_NDIM(np_time) != 1 + || PyArray_NDIM(np_exoticity) != 1 + || PyArray_NDIM(np_dbc) != 1 + || PyArray_NDIM(np_nlm) != 1 + || PyArray_NDIM(np_defn) != 1 + || PyArray_NDIM(np_matchedTrackN) != 1 + || PyArray_NDIM(np_matchedTrackDeltaEta) != 1 + || PyArray_NDIM(np_matchedTrackDeltaPhi) != 1 + || PyArray_NDIM(np_matchedTrackP) != 1 + || PyArray_NDIM(np_matchedTrackPt) != 1 + || PyArray_NDIM(np_matchedTrackSel) != 1 + ) + { + PyErr_SetString(PyExc_TypeError, "Arrays must be one-dimensional"); + return std::vector(); + } + + npy_intp size = PyArray_SIZE(np_energy); + if (size != PyArray_SIZE(np_eta) + || size != PyArray_SIZE(np_phi) + || size != PyArray_SIZE(np_m02) + || size != PyArray_SIZE(np_m02) + || size != PyArray_SIZE(np_m20) + || size != PyArray_SIZE(np_ncells) + || size != PyArray_SIZE(np_time) + || size != PyArray_SIZE(np_exoticity) + || size != PyArray_SIZE(np_dbc) + || size != PyArray_SIZE(np_nlm) + || size != PyArray_SIZE(np_defn) + || size != PyArray_SIZE(np_matchedTrackN) + ) + { + PyErr_SetString(PyExc_ValueError, "Arrays must have the same size"); + return std::vector(); + } + + // if no clusters in the event, return empty vector + if (size == 0) { + return std::vector(); + } + + // Data types here have to match those from the NumPy arrays + // These are taken from the types that uproot reads from the BerkeleyTrees + float *energy_data = static_cast 
(PyArray_DATA(np_energy)); + float *eta_data = static_cast (PyArray_DATA(np_eta)); + float *phi_data = static_cast (PyArray_DATA(np_phi)); + float *m02_data = static_cast (PyArray_DATA(np_m02)); + float *m20_data = static_cast (PyArray_DATA(np_m20)); + int *ncells_data = static_cast (PyArray_DATA(np_ncells)); + float *time_data = static_cast (PyArray_DATA(np_time)); + bool *exoticity_data = static_cast (PyArray_DATA(np_exoticity)); + float *dbc_data = static_cast (PyArray_DATA(np_dbc)); + int *nlm_data = static_cast (PyArray_DATA(np_nlm)); + int *defn_data = static_cast (PyArray_DATA(np_defn)); + int *matchedTrackN_data = static_cast (PyArray_DATA(np_matchedTrackN)); + // float *matchedTrackEta_data = static_cast (PyArray_DATA(np_matchedTrackEta)); + // float *matchedTrackPhi_data = static_cast (PyArray_DATA(np_matchedTrackPhi)); + // float *matchedTrackP_data = static_cast (PyArray_DATA(np_matchedTrackP)); + + npy_intp sizeMatchedDeltaEta = PyArray_SIZE(np_matchedTrackDeltaEta); + npy_intp sizeMatchedDeltaPhi = PyArray_SIZE(np_matchedTrackDeltaPhi); + npy_intp sizeMatchedP = PyArray_SIZE(np_matchedTrackP); + npy_intp sizeMatchedPt = PyArray_SIZE(np_matchedTrackPt); + npy_intp sizeMatchedSel = PyArray_SIZE(np_matchedTrackSel); + + uint16_t nMatchedTracks; + for (npy_intp i = 0; i < size; ++i) { + nMatchedTracks += matchedTrackN_data[i]; + } + + if ( nMatchedTracks != sizeMatchedDeltaEta + || nMatchedTracks != sizeMatchedDeltaPhi + || nMatchedTracks != sizeMatchedP + || nMatchedTracks != sizeMatchedPt + || nMatchedTracks != sizeMatchedSel) + { + PyErr_SetString(PyExc_ValueError, "Number of matched tracks does not fit"); + return std::vector(); + } + + std::vector clusters; + clusters.reserve(size); + + int iMatchedTrack(0); + for (npy_intp i = 0; i < size; ++i) + { + // assume massless in this calculation + double pt = energy_data[i] / cosh(eta_data[i]); + // double pz = energy_data[i] * tanh(eta_data[i]); + // if (pt <= 0.001) // this is 1 MeV (!) 
+ // { + // // PyErr_SetString(PyExc_ValueError, "pt must be positive"); + // return std::vector(); + // continue; + // } + double px = pt * cos(phi_data[i]); + double py = pt * sin(phi_data[i]); + double pz = pt * sinh(eta_data[i]); + + npy_intp const dims[1] = {matchedTrackN_data[i]}; + PyArrayObject* matchedTrackDeltaEta = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyArrayObject* matchedTrackDeltaPhi = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyArrayObject* matchedTrackP = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyArrayObject* matchedTrackPt = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyArrayObject* matchedTrackSel = reinterpret_cast(PyArray_SimpleNew(1, dims, NPY_UINT8)); + + for (int iTrack = 0; iTrack < matchedTrackN_data[i]; ++iTrack) { + *(npy_float *)PyArray_GETPTR1(matchedTrackDeltaEta, iTrack) = *(npy_float *)PyArray_GETPTR1(np_matchedTrackDeltaEta, iMatchedTrack); + *(npy_float *)PyArray_GETPTR1(matchedTrackDeltaPhi, iTrack) = *(npy_float *)PyArray_GETPTR1(np_matchedTrackDeltaPhi, iMatchedTrack); + *(npy_float *)PyArray_GETPTR1(matchedTrackP, iTrack) = *(npy_float *)PyArray_GETPTR1(np_matchedTrackP, iMatchedTrack); + *(npy_float *)PyArray_GETPTR1(matchedTrackPt, iTrack) = *(npy_float *)PyArray_GETPTR1(np_matchedTrackPt, iMatchedTrack); + *(npy_uint8 *)PyArray_GETPTR1(matchedTrackSel, iTrack) = *(npy_uint8 *)PyArray_GETPTR1(np_matchedTrackSel, iMatchedTrack); + iMatchedTrack++; + } + clusters.emplace_back(px, py, pz, + energy_data[i], + m02_data[i], + m20_data[i], + ncells_data[i], + time_data[i], + exoticity_data[i], + dbc_data[i], + nlm_data[i], + defn_data[i], + matchedTrackN_data[i], + (PyObject*)matchedTrackDeltaEta, + (PyObject*)matchedTrackDeltaPhi, + (PyObject*)matchedTrackP, + (PyObject*)matchedTrackPt, + (PyObject*)matchedTrackSel + ); + clusters.back().set_user_index(i + index_offset); + } + if (iMatchedTrack != nMatchedTracks) { + PyErr_SetString(PyExc_ValueError, "Somehow 
did not consume all matched tracks"); + return std::vector(); + } + return clusters; + } } \ No newline at end of file diff --git a/alian/src/fjutil/fjutil.hh b/alian/src/fjutil/fjutil.hh index 1798c6c..78932f7 100644 --- a/alian/src/fjutil/fjutil.hh +++ b/alian/src/fjutil/fjutil.hh @@ -43,6 +43,159 @@ namespace alian std::vector numpy_pxpypz_to_pseudojets(PyObject *px, PyObject *py, PyObject *pz, double m, int index_offset = 0); std::vector numpy_ptetaphi_to_pseudojets(PyObject *pt, PyObject *eta, PyObject *phi, double m, int index_offset = 0); + + constexpr double PION_MASS = 0.1395704; + + std::vector numpy_pxpypz_to_tracks(PyObject *px, PyObject *py, PyObject *pz, PyObject *tracksel, int index_offset); + std::vector numpy_ptetaphi_to_tracks(PyObject *pt, PyObject *eta, PyObject *phi, PyObject *tracksel, int index_offset); + + class TrackInfo : public fastjet::PseudoJet::UserInfoBase + { + public: + TrackInfo(short q, uint16_t track_sel) : _q(q), _track_sel(track_sel) {} + + const short q() const { return _q; } + const uint16_t track_sel() const { return _track_sel; } + + private: + short _q; + uint16_t _track_sel; + }; + + class Cluster : public fastjet::PseudoJet + { + public: + Cluster( + const double px_in, + const double py_in, + const double pz_in, + const double E_in, + const float m02, + const float m20, + const int ncells, + const float time, + const bool exoticity, + const float dbc, + const int nlm, + const int defn, + const int matchedTrackN, + PyObject* matchedTrackDeltaEta, + PyObject* matchedTrackDeltaPhi, + PyObject* matchedTrackP, + PyObject* matchedTrackPt, + PyObject* matchedTrackSel + ) : fastjet::PseudoJet(px_in, py_in, pz_in, E_in), + _m02(m02), + _m20(m20), + _ncells(ncells), + _time(time), + _exoticity(exoticity), + _dbc(dbc), + _nlm(nlm), + _defn(defn), + _matchedTrackN(matchedTrackN), + _matchedTrackDeltaEta(matchedTrackDeltaEta), + _matchedTrackDeltaPhi(matchedTrackDeltaPhi), + _matchedTrackP(matchedTrackP), + 
_matchedTrackPt(matchedTrackPt), + _matchedTrackSel(matchedTrackSel) + { + Py_INCREF(_matchedTrackDeltaEta); + Py_INCREF(_matchedTrackDeltaPhi); + Py_INCREF(_matchedTrackP); + Py_INCREF(_matchedTrackPt); + Py_INCREF(_matchedTrackSel); + } + + ~Cluster() { + Py_DECREF(_matchedTrackDeltaEta); + Py_DECREF(_matchedTrackDeltaPhi); + Py_DECREF(_matchedTrackP); + Py_DECREF(_matchedTrackPt); + Py_DECREF(_matchedTrackSel); + } + template + const double delta_eta(const T &other) const { return other.eta() - eta(); }; + template + const double abs_delta_eta(const T &other) const { return fabs(other.eta() - eta()); }; + template + const double abs_delta_phi(const T &other) const { return fabs(delta_phi_to(other)); }; + template + const double dR(const T &other) const { + double dphi = delta_phi_to(other); + double deta = delta_eta(other); + return sqrt(dphi*dphi + deta*deta); + }; + inline const auto energy() const { return e(); }; + inline const auto m02() const { return _m02; }; + inline const auto m20() const { return _m20; }; + inline const auto ncells() const { return _ncells; }; + inline const auto time() const { return _time; }; + inline const auto exoticity() const { return _exoticity; }; + inline const auto dbc() const { return _dbc; }; + inline const auto nlm() const { return _nlm; }; + inline const auto defn() const { return _defn; }; + inline const auto matchedTrackN() const { return _matchedTrackN; }; + inline auto matchedTrackDeltaEta() const { Py_INCREF(_matchedTrackDeltaEta); return _matchedTrackDeltaEta; }; + inline auto matchedTrackDeltaPhi() const { Py_INCREF(_matchedTrackDeltaPhi); return _matchedTrackDeltaPhi; }; + inline auto matchedTrackP() const { Py_INCREF(_matchedTrackP); return _matchedTrackP; }; + inline auto matchedTrackPt() const { Py_INCREF(_matchedTrackPt); return _matchedTrackPt; }; + inline auto matchedTrackSel() const { Py_INCREF(_matchedTrackSel); return _matchedTrackSel; }; + // Copy constructor + // Cluster(const Cluster& other) + // : 
_m02(other.m02()), + // _m20(other.m20()), + // _ncells(other.ncells()), + // _time(other.time()), + // _exoticity(other.exoticity()), + // _dbc(other.dbc()), + // _nlm(other.nlm()), + // _defn(other.defn()), + // _matchedTrackN(other.matchedTrackN()), + // _matchedTrackEta(other.matchedTrackEta()), + // _matchedTrackPhi(other.matchedTrackPhi()), + // _matchedTrackP(other.matchedTrackP()) { + // Py_DECREF(_matchedTrackEta); + // // Py_DECREF(_matchedTrackEta); + // Py_DECREF(_matchedTrackPhi); + // Py_DECREF(_matchedTrackP); + // } + + private: + float _m02; + float _m20; + int _ncells; + float _time; + bool _exoticity; + float _dbc; + int _nlm; + int _defn; + int _matchedTrackN; + PyObject* _matchedTrackDeltaEta; + PyObject* _matchedTrackDeltaPhi; + PyObject* _matchedTrackP; + PyObject* _matchedTrackPt; + PyObject* _matchedTrackSel; + }; + + std::vector numpy_energyetaphi_to_clusters( + PyObject *energy, PyObject *eta, PyObject *phi, + PyObject *m02, + PyObject *m20, + PyObject *ncells, + PyObject *time, + PyObject *exoticity, + PyObject *dbc, + PyObject *nlm, + PyObject *defn, + PyObject *matchedTrackN, + PyObject *matchedTrackDeltaEta, + PyObject *matchedTrackDeltaPhi, + PyObject *matchedTrackP, + PyObject *matchedTrackPt, + PyObject *matchedTrackSel, + int index_offset + ); } #endif \ No newline at end of file diff --git a/alian/utils/data_fj.py b/alian/utils/data_fj.py index aba2f71..3e7d617 100644 --- a/alian/utils/data_fj.py +++ b/alian/utils/data_fj.py @@ -24,6 +24,9 @@ def psjv_from_tracks_run3(event, index_offset=0, m=0.13957): rv = alian.numpy_ptetaphi_to_pseudojets(event.data['track_data_pt'], event.data['track_data_eta'], event.data['track_data_phi'], m, index_offset) return rv +def psjv_from_tracks_run3_selected(pt, eta, phi, m, index_offset): + return alian.numpy_ptetaphi_to_pseudojets(pt, eta, phi, m, index_offset) + def psjv_from_tracks_run2(event, index_offset=0, m=0.13957): rv = std.vector[fj.PseudoJet]() for i in 
range(len(event.data['ParticlePt'])): @@ -39,11 +42,10 @@ def psjv_from_tracks_run2(event, index_offset=0, m=0.13957): return rv def data_tracks_to_pseudojets(event, index_offset=0, m=0.13957, lhc_run=3): - if lhc_run == 3: - return psjv_from_tracks_run3(event, index_offset, m) - elif lhc_run == 2: - return psjv_from_tracks_run2(event, index_offset, m) - else: - raise ValueError(f'Invalid run number: {lhc_run}') - return None - + if lhc_run == 3: + return psjv_from_tracks_run3(event, index_offset, m) + elif lhc_run == 2: + return psjv_from_tracks_run2(event, index_offset, m) + else: + raise ValueError(f'Invalid run number: {lhc_run}') + return None From 4d9b94f87b8c85ffbf16f8bf6edf70eb9b3d563a Mon Sep 17 00:00:00 2001 From: Tucker Hwang Date: Tue, 14 Apr 2026 07:35:09 -0700 Subject: [PATCH 3/3] Add example analyses --- alian/analysis/example/example_analysis.py | 80 ++++++++++++ alian/analysis/example/example_rdf.py | 142 +++++++++++++++++++++ alian/analysis/example/merge.sh | 100 +++++++++++++++ alian/config/example/example_analysis.yaml | 65 ++++++++++ alian/config/example/example_rdf.yaml | 25 ++++ 5 files changed, 412 insertions(+) create mode 100755 alian/analysis/example/example_analysis.py create mode 100755 alian/analysis/example/example_rdf.py create mode 100755 alian/analysis/example/merge.sh create mode 100644 alian/config/example/example_analysis.yaml create mode 100644 alian/config/example/example_rdf.yaml diff --git a/alian/analysis/example/example_analysis.py b/alian/analysis/example/example_analysis.py new file mode 100755 index 0000000..cc55732 --- /dev/null +++ b/alian/analysis/example/example_analysis.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +"""Example script on using the alian framework for Run 3 analysis. + +This script is used on the ROOT file (provided as an argument) to +perform the analysis of the events containing isolated photons and +jets. 
The analysis is configured with a config YAML file, creates +and fills histograms for the cleaned data, and stores them in the +output file. + +To be used with alian/config/example/example_analysis.yaml +""" + +import argparse +import itertools + +import numpy as np +from alian.analysis.base import AnalysisBase, add_default_args, delta_R + +import heppyy + +fj = heppyy.load_cppyy('fastjet') + + +class AnalysisExample(AnalysisBase): + _defaults = { + 'photon_jet_angle_min': 0.875 * np.pi, + 'pt_min_eec': 1.0, + } + def init_analysis(self, analysis_cfg: dict): + config = self._defaults | analysis_cfg + for setting, value in config.items(): + setattr(self, setting, value) + self.eec_trk_selector = fj.SelectorPtMin(self.pt_min_eec) + + def analyze_event(self): + # Analyzes this event that has passed the selection criteria + # self.event contains the selected event + # self.tracks contains selected tracks (i.e. after selection cuts) + # self.clusters contains selected clusters (i.e. after selection cuts) + # self.jets contains selected jets + + [self.hists['track_pT'].Fill(t.pt()) for t in self.tracks] + [self.hists['jet_pT'].Fill(j.pt()) for j in self.jets] + [self.hists['jet_pT_coarse'].Fill(j.pt()) for j in self.jets] + + for c in self.clusters: + is_iso, iso_pt, _, _ = self.selector.isolation.selects(c, self.tracks, verbose = True) + self.hists['tot_iso_pT'].Fill(iso_pt) + if is_iso: + self.hists['photon_E'].Fill(c.e()) + # find back-to-back jets + for j in self.jets: + dphi = np.abs(j.delta_phi_to(c)) + if dphi > self.photon_jet_angle_min: + self.hists["jet_photon_dphi"].Fill(dphi) + self.hists["y_jet_pT"].Fill(j.pt()) + self.hists["xjy"].Fill(j.pt() / c.e(), c.e()) + self.do_eec(j) + + def do_eec(self, jet): + tracks = self.eec_trk_selector(jet.constituents()) + for p1, p2 in itertools.permutations(tracks, 2): + ew = p1.pt() * p2.pt() / jet.pt() / jet.pt() + angle = delta_R(p1, p2) + + self.hists["eec"].Fill(jet.pt(), angle, ew) + + def finalize(self): + 
self.hists['track_pT'].Scale(1, "width") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Run analysis on ROOT file using YAML configuration.") + parser = add_default_args(parser) + + args = parser.parse_args() + + ana = AnalysisExample(args.input_file, args.output_file, args.config_file, args.tree_struct, args.nev, args.lhc_run) + ana.run() diff --git a/alian/analysis/example/example_rdf.py b/alian/analysis/example/example_rdf.py new file mode 100755 index 0000000..fe90995 --- /dev/null +++ b/alian/analysis/example/example_rdf.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 + +"""Example script on using RDataFrame for analysis. + +An example script that uses RDataFrame for analysis. RDF +implementations can only handle basic selections on cluster +and track quantities; for more complicated tasks like jet +finding or cluster-track matching, use example_analysis.py. + +To be used with alian/config/example/example_rdf.yaml +""" + +import argparse +from datetime import datetime as dt +from time import perf_counter as timenow + +import numpy as np +import ROOT +from alian.analysis.base import AnalysisSelector, linbins, logbins, set_up_logger + +ROOT.TH1.SetDefaultSumw2() # applies to TH2 and TH3 as well + +class AnalysisQA: + def __init__(self, inf, outf, cfg): + self.logger = set_up_logger(__name__) + self.outf = outf + self.selector = AnalysisSelector.load(cfg) + self.df = ROOT.RDataFrame("eventTree", inf) + self.histos = {} + + def note_time(self, msg): + self.logger.info(f"{msg}: -------- {timenow() - self.start_time:.3f} sec. --------", stacklevel = 2) + def note_start(self, msg): + self.start_time = timenow() + self.logger.info(f"{msg}: {dt.now().replace(microsecond=0)}", stacklevel = 2) + + def analyze(self): + self.note_start("Starting analysis") + self.apply_cuts() + self.create_histos() + self.fill_and_save() + self.note_time("Finished analysis") + + def apply_cuts(self): + """ + Apply cuts to events, tracks, and clusters. 
The event selection + is just a Filter applied on the RDF. Cluster and track selections + are a little more complicated since they're vector columns, so + the implementation is hidden behind apply_to(). This method + creates a new set of columns for tracks and clusters (appended + with `_selected`) with the relevant cuts applied to them. Use + these column names to fill histograms from clusters or tracks + with the cuts applied, or use the default track_pt, track_phi; etc. + """ + self.df = self.selector.event.apply_to(self.df) + # cluster pT isn't a column that exists already in the BerkeleyTree so we can create it: + # self.df = self.df.Define("cluster_pt", "cluster_energy / cosh(cluster_eta)") + # Applying the cluster selection will also automatically add it if it's not already defined: + self.df = self.selector.cluster.apply_to(self.df) + self.df = self.selector.track.apply_to(self.df) + self.note_time("Cuts applied") + + def create_histos(self): + """ + Define your histograms here. For Histo*D, the first argument is + a tuple corresponding to what you would use for a normal TH*. + The second is the column that your histogram will be Filled with. + Note that the histograms themselves are not filled until + fill_and_save() is called. 
+ """ + self.histos['track_pt'] = self.df.Histo1D( + ("track_pt", 'Track #it{p}_{T};track #it{p}_{T} (GeV);Counts', 300, logbins(0.15, 200, 300)), + "track_pt_selected" + ) + self.histos['track_eta'] = self.df.Histo1D( + ("track_eta", 'Track #eta;track #eta;Counts', 200, -0.9, 0.9), + "track_eta_selected" + ) + self.histos['track_phi'] = self.df.Histo1D( + ("track_phi", 'Track #phi;track #phi;Counts', 200, 0, 2*np.pi), + "track_phi_selected" + ) + self.histos['track_eta_phi'] = self.df.Histo2D( + ("track_eta_phi", 'Track #eta-#phi distribution;#eta;#phi', 200, -0.9, 0.9, 200, 0, 2*np.pi), + "track_eta_selected", "track_phi_selected" + ) + pt_bins = np.array([0.15, 0.5, 1, 5, 10, 15, 20, 30, 40, 50, 60, 100, 200]) + self.histos['track_eta_phi_pt'] = self.df.Histo3D( + ("track_eta_phi_pT", 'Track #eta-#phi-#it{p}_{T} distribution;#eta;#phi;#it{p}_{T}', + 100, linbins(-0.9, 0.9, 100), + 100, linbins(0, 2*np.pi, 100), + len(pt_bins) - 1, pt_bins), + "track_eta_selected", "track_phi_selected", "track_pt_selected" + ) + + # these histograms will be filled with all clusters, not just those that pass selections, + # since we are not using the columns with `_selected` appended + self.histos['cluster_E'] = self.df.Histo1D( + ("cluster_E", 'Cluster energy distribution;#it{E} (GeV);Counts', 200, 0, 100), + "cluster_energy" + ) + self.histos['cluster_pT'] = self.df.Histo1D( + ("cluster_pT", 'Cluster #it{p}_{T} distribution;#it{p}_{T} (GeV/#it{c});Counts', 200, 0, 100), + "cluster_pt" + ) + self.histos['cluster_m02'] = self.df.Histo1D( + ("cluster_m02", 'Cluster shape;#it{M}_{02};Counts', 400, 0, 2), + "cluster_m02" + ) + self.histos['cluster_time'] = self.df.Histo1D( + ("cluster_time", 'Cluster time;time (ns);Counts', 120, -30, 30), + "cluster_time" + ) + self.histos['cluster_ncells'] = self.df.Histo1D( + ("cluster_ncells", 'Cluster cell multiplicity;n_{cells};Counts', 25, -0.5, 24.5), + "cluster_ncells" + ) + + self.note_time("Histograms defined") + + def fill_and_save(self): 
+ """ + Fill and save all defined histograms. Note that, for histograms + defined in RDFs, filling is triggered when Write() is called. + """ + self.note_time("Filling and saving histograms...") + with ROOT.TFile(self.outf, 'recreate'): + for h in self.histos.values(): + h.Write() + self.note_time("Histograms saved") + self.logger.info(f"Output saved to: {self.outf}") + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-i", "--input", type = str, required = True, help = "Input file or file containing a list of input files.") + parser.add_argument("-o", "--output", type = str, required = True, help = "File to write the analysis.") + parser.add_argument("-c", "--config", type = str, required = True, help = "YAML file describing the cuts to be applied to the data.") + args = parser.parse_args() + + ROOT.EnableImplicitMT() + analysis = AnalysisQA(args.input, args.output, args.config) + analysis.analyze() \ No newline at end of file diff --git a/alian/analysis/example/merge.sh b/alian/analysis/example/merge.sh new file mode 100755 index 0000000..0284970 --- /dev/null +++ b/alian/analysis/example/merge.sh @@ -0,0 +1,100 @@ +#!/usr/bin/bash + +# A simple script to merge files within a directory. Feel free to make a copy +# and customize to your needs. + +show_help() { + cat << EOF +Usage: + $0 -i|--input INPUT -o|--output OUTPUT [-n|--name FILENAME] [-f|--force] [-h|--help] + +Searches for files in a given directory INPUT and a given filename FILENAME, +and merges them with ROOT hadd into a specified output file OUTPUT. +The list of files that were merged will be placed in a text file in the +same directory as the output file. 
+ +Options: + -i, --input INPUT Input directory to search for files + -o, --output OUTPUT Filename for merged output + -n, --name NAME Search for NAME in INPUT (default: analysis.root) + -j, --jobs [N] Parallelize merging with N jobs (default: system max $(nproc)) + -f, --force Overwrite existing output file (default: false) + -h, --help Show this help message + +EOF +} + +if ! command -v hadd >/dev/null 2>&1; then echo "hadd not available, crashing." && exit 127; fi + +if [ $# -eq 0 ]; then + show_help && exit 0 +fi + +PARSEDARGS=$(getopt -o i:o:n:j:fh \ + --long input:,output:,name:,jobs:,force,help \ + -n 'merge' -- "$@") +PARSE_EXIT=$? +if [ $PARSE_EXIT -ne 0 ] ; then exit $PARSE_EXIT ; fi + +# NOTE: quotes around PARSEDARGS essential to parse spaces between parsed options +eval set -- "$PARSEDARGS" + +input= +output= +filename=analysis.root +force_opt= +njobs=$(nproc) +echo "Maximum cores: $njobs" + +while true; do + case "$1" in + -i | --input ) input="$2"; shift 2;; + -o | --output ) output="$2"; shift 2;; + -n | --name ) filename="$2"; shift 2;; + -j | --jobs ) if (( "$2" < njobs )); then + njobs="$2"; + else + echo "Not enough cores for $2 jobs, setting to maximum $njobs."; fi + shift 2;; + -f | --force ) force_opt="-f"; shift ;; + -h | --help ) show_help; exit 0 ;; + -- ) shift; break ;; + * ) break ;; + esac +done + +if [[ -z "$input" ]]; then + echo "Input directory must be passed:" + show_help + exit 1 +fi +if [[ -z "$output" ]]; then + echo "Output filename must be passed:" + show_help + exit 1 +fi + +filelist="$(dirname -- "$(realpath "$output")")/list.txt" + +if [[ -z "$force_opt" ]]; then + if [[ -f $filelist ]]; then + echo "Filelist $filelist already exists; pass -f or --force to overwrite." + exit 1 + fi + if [[ -f $output ]]; then + echo "Output file $output already exists; pass -f or --force to overwrite." + exit 1 + fi +fi + +echo "Searching in $input for files named $filename..." 
+find "$input" -name "$filename" -type f > "$filelist" +echo "Found $(wc -l < "$filelist") files to merge. Merging..." + +hadd -v 99 $force_opt -j "$njobs" "$output" @"$filelist" +exitcode=$? +if [[ $exitcode -ne 0 ]]; then + echo "Merging failed." + exit $exitcode +fi +echo "Merging completed." diff --git a/alian/config/example/example_analysis.yaml b/alian/config/example/example_analysis.yaml new file mode 100644 index 0000000..138649a --- /dev/null +++ b/alian/config/example/example_analysis.yaml @@ -0,0 +1,65 @@ +# to be used with alian/analysis/example/example_analysis.py +constants: # a YAML hack to define constants + pi34: &pi34 2.35619449019 + halfpi: &pihalf 1.57079632679 + pi: &pi 3.14159265359 + +# Event cuts +selections: + event: + event_sel: sel8 + trig_sel: GammaHighPtEMCAL, GammaHighPtDCAL + rct: CBT_calo + track: + pt_min: 0.150 + track_sel: globalTrack + # remove the cluster section below if clusters are not needed in analysis + # if clusters are needed but no selections should be applied, keep only `cluster:` + cluster: + energy_min: 0.7 + m02_min: none + m02_max: none + ncells_min: 2 + time_min: -30 + time_max: 35 + exoticity: True + nlm: 2 + defn: 0 + edge: True + delta_phi: 0.05 + delta_eta: 0.05 + ep_max: 1.75 + isolation: + iso_pt_max: 1.5 + +# remove the jet_finder section below if jets are not needed in analysis +jet_finder: + R: 0.4 + alg: antikt + pT_min: 10 + +analysis: + photon_jet_angle_min: *pi34 + pt_min_eec: 1.0 + +output: + bins: + track_pT: ['log', 0.1, 20, 300] + iso_pT: ['lin', 0, 10, 100] + jet_pT: ['lin', 0, 200, 200] + jet_pT_coarse: ['custom', [5, 20, 40, 60, 80, 100, 150, 200]] + RL: ['log', 1.0e-3, 1, 30] + photon_E: ['lin', 0, 80, 200] + dphi: ['lin', *pihalf, *pi, 30] + xjy: ['lin', 0.5, 1.5, 30] + histograms: + # define histograms with bins described in `bins` + track_pT: [1, "track_pT", "Track #it{p}_{T} spectrum;#it{p}_{T} (GeV);dN/d#it{p}_{T}", track_pT] + tot_iso_pT: [1, "iso_pT", "Isolation;isolation #it{p}_{T} 
(GeV);Counts", iso_pT] + jet_pT: [1, "jet_pT", "Jet #it{p}_{T} spectrum;jet #it{p}_{T} (GeV);Counts", jet_pT] + jet_pT_coarse: [1, "jet_pT_coarse", "Jet #it{p}_{T} spectrum (coarse binning);jet #it{p}_{T} (GeV);Counts", jet_pT_coarse] + photon_E: [1, "photon_E", "Photon energy spectrum;photon energy (GeV);Counts", photon_E] + jet_photon_dphi: [1, "dphi", "Photon-jet #Delta#phi;#Delta#phi (radians);Counts", dphi] + y_jet_pT: [1, "y_jet_pT", "Photon-tagged jet #it{p}_{T} spectrum;jet #it{p}_{T} (GeV);Counts", jet_pT] + eec: [2, "eec", "Photon-tagged EEC;jet #it{p}_{T} (GeV);#it{R}_{L}", jet_pT, RL] + xjy: [2, "xjy", "Photon-jet imbalance;#it{x}_{J#gamma};photon energy (Gev)", xjy, photon_E] diff --git a/alian/config/example/example_rdf.yaml b/alian/config/example/example_rdf.yaml new file mode 100644 index 0000000..f694da0 --- /dev/null +++ b/alian/config/example/example_rdf.yaml @@ -0,0 +1,25 @@ +# to be used with alian/analysis/example/example_rdf.py +selections: + event: + event_sel: sel8 + trig_sel: GammaHighPtEMCAL, GammaHighPtDCAL + rct: CBT_calo + track: + pt_min: 0.150 + track_sel: globalTrack + cluster: + energy_min: 0.7 + m02_min: none + m02_max: none + ncells_min: 2 + time_min: -30 + time_max: 35 + exoticity: True + nlm: 2 + defn: 0 + edge: True + delta_phi: 0.05 + delta_eta: 0.05 + ep_max: 1.75 + isolation: + iso_pt_max: 1.5