diff --git a/src/jabs/classifier/__init__.py b/src/jabs/classifier/__init__.py
index 4c576d1e..e4751575 100644
--- a/src/jabs/classifier/__init__.py
+++ b/src/jabs/classifier/__init__.py
@@ -10,16 +10,20 @@
 from .multi_class_classifier import MultiClassClassifier
 from .protocols import ClassifierProtocol
 from .training_report import (
+    BinaryCVResult,
     CrossValidationResult,
+    MultiClassCVResult,
     TrainingReportData,
     generate_markdown_report,
     save_training_report,
 )
 
 __all__ = [
+    "BinaryCVResult",
     "Classifier",
     "ClassifierProtocol",
     "CrossValidationResult",
+    "MultiClassCVResult",
     "MultiClassClassifier",
     "TrainingReportData",
     "generate_markdown_report",
diff --git a/src/jabs/classifier/base.py b/src/jabs/classifier/base.py
new file mode 100644
index 00000000..c6b1ce24
--- /dev/null
+++ b/src/jabs/classifier/base.py
@@ -0,0 +1,328 @@
+"""Shared infrastructure for behavior classifiers.
+
+``BaseClassifier`` consolidates persistence, identity properties, factory
+dispatch, feature cleaning, and feature-importance reporting that are common
+to both binary and multi-class classifiers. Subclasses provide the train and
+predict implementations that determine the actual learning behavior.
+
+The class is concrete (not abstract): subclasses are not required to
+override anything to instantiate. Public surface for classifier *consumers*
+is governed by :class:`jabs.classifier.ClassifierProtocol`, which both
+subclasses satisfy structurally.
+"""
+
+from __future__ import annotations
+
+import typing
+import warnings
+from pathlib import Path
+from typing import ClassVar
+
+import joblib
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+from sklearn.exceptions import InconsistentVersionWarning
+
+from jabs.core.enums import ClassifierType
+from jabs.core.utils import hash_file
+
+from . import classifier_utils, factories
+
+
+class BaseClassifier:
+    """Shared persistence and identity machinery for JABS classifiers.
+
+    Class attributes that subclasses must set:
+        ``_VERSION``: pickled-format version integer for this subclass.
+        ``_MULTICLASS``: True if this subclass operates in multi-class mode.
+        ``_PERSISTED_REQUIRED``: tuple of instance attribute names that ``load``
+            must restore from the pickled instance.
+        ``_PERSISTED_OPTIONAL``: tuple of instance attribute names that
+            ``load`` should restore if present on the pickled instance (default
+            to ``None`` otherwise). Used to support older pickles that may not
+            have all attributes the live class now declares.
+    """
+
+    _VERSION: ClassVar[int] = 0
+    _MULTICLASS: ClassVar[bool] = False
+    _PERSISTED_REQUIRED: ClassVar[tuple[str, ...]] = ()
+    _PERSISTED_OPTIONAL: ClassVar[tuple[str, ...]] = ()
+
+    def __init__(self, classifier_type: ClassifierType, n_jobs: int = 1) -> None:
+        self._classifier_type = classifier_type
+        self._classifier: typing.Any = None
+        self._project_settings: dict | None = None
+        self._feature_names: list[str] | None = None
+        self._n_jobs = n_jobs
+        self._version = self._VERSION
+
+        self._classifier_file: str | None = None
+        self._classifier_hash: str | None = None
+        self._classifier_source: str | None = None
+
+        self._supported_classifiers = self._supported_classifier_choices()
+        if classifier_type not in self._supported_classifiers:
+            raise ValueError("Invalid classifier type")
+
+    @property
+    def classifier_name(self) -> str:
+        """Return the name of the underlying algorithm."""
+        return self._classifier_type.value
+
+    @property
+    def classifier_type(self) -> ClassifierType:
+        """Return the underlying classifier algorithm enum value."""
+        return self._classifier_type
+
+    @property
+    def classifier_file(self) -> str | None:
+        """Return the filename of the saved classifier, if any."""
+        return self._classifier_file
+
+    @property
+    def classifier_hash(self) -> str | None:
+        """Return the content hash of the saved classifier, if any."""
+        return self._classifier_hash
+
+    @property
+    def project_settings(self) -> dict:
+        """Return a copy of the classifier's training settings."""
+        if self._project_settings is not None:
+            return dict(self._project_settings)
+        return {}
+
+    @property
+    def version(self) -> int:
+        """Return the serialized classifier format version."""
+        return self._version
+
+    @property
+    def feature_names(self) -> list[str] | None:
+        """Return the list of feature names used to train this classifier."""
+        return self._feature_names
+
+    @classmethod
+    def _supported_classifier_choices(cls) -> set[ClassifierType]:
+        """Return classifier types available in the current environment.
+
+        Resolved per-call so that test code can patch
+        :func:`jabs.classifier.factories.supported_classifier_types` or this
+        method on the subclass without freezing state at import time.
+        """
+        return factories.supported_classifier_types(multiclass=cls._MULTICLASS)
+
+    def set_classifier(self, classifier: ClassifierType) -> None:
+        """Switch the underlying classifier algorithm.
+
+        Args:
+            classifier: The classifier type to switch to.
+
+        Raises:
+            ValueError: If the classifier type is not supported.
+        """
+        if classifier not in self._supported_classifier_choices():
+            raise ValueError("Invalid Classifier Type")
+        self._classifier_type = classifier
+
+    def set_dict_settings(self, settings: dict) -> None:
+        """Assign classifier settings from a dictionary.
+
+        Args:
+            settings: dict of settings (same structure as
+                ``project.settings_manager.get_behavior``).
+        """
+        self._project_settings = dict(settings)
+
+    def classifier_choices(self) -> dict[ClassifierType, str]:
+        """Return the available classifier types as a sorted display map.
+
+        Returns:
+            dict mapping ``ClassifierType`` enum values to their string names.
+        """
+        return {t: t.value for t in sorted(self._supported_classifiers, key=lambda t: t.value)}
+
+    def _create_classifier(self, random_seed: int | None = None) -> typing.Any:
+        """Instantiate the underlying sklearn/xgboost/catboost classifier."""
+        factory = factories.get_factory(self._classifier_type, multiclass=self._MULTICLASS)
+        return factory(self._n_jobs, random_seed)
+
+    def _clean_features(self, features: pd.DataFrame) -> pd.DataFrame:
+        """Replace ±inf/NaN in feature matrix per classifier type."""
+        return classifier_utils.clean_features(features, self._classifier_type)
+
+    def _get_features_to_classify(self, features: pd.DataFrame) -> pd.DataFrame:
+        """Reorder/select feature columns to match the trained model.
+
+        Args:
+            features: DataFrame of feature data to filter.
+
+        Returns:
+            DataFrame containing only the columns the trained model expects,
+            in the order the model expects them.
+
+        Raises:
+            RuntimeError: If feature names cannot be obtained from the model.
+        """
+        if self._classifier_type == ClassifierType.XGBOOST:
+            classifier_columns = self._classifier.get_booster().feature_names
+        elif hasattr(self._classifier, "feature_names_in_"):
+            classifier_columns = list(self._classifier.feature_names_in_)
+        elif hasattr(self._classifier, "feature_names_"):
+            classifier_columns = list(self._classifier.feature_names_)
+        else:
+            raise RuntimeError("Error obtaining feature names from classifier.")
+        return features[classifier_columns]
+
+    @staticmethod
+    def combine_data(per_frame: pd.DataFrame, window: pd.DataFrame) -> pd.DataFrame:
+        """Combine per-frame and window feature DataFrames into one."""
+        return classifier_utils.combine_data(per_frame, window)
+
+    @staticmethod
+    def derive_predictions(
+        probabilities: npt.NDArray[np.floating],
+    ) -> tuple[npt.NDArray[np.int8], npt.NDArray[np.floating]]:
+        """Derive class predictions and confidence from class probabilities.
+
+        Args:
+            probabilities: Array of shape ``(n_frames, n_classes)`` of predicted
+                class probabilities.
+
+        Returns:
+            Tuple ``(predictions, confidence)`` where ``predictions`` is the
+            argmax class index per frame (``-1`` if confidence is zero,
+            indicating no pose) and ``confidence`` is the probability of the
+            chosen class.
+        """
+        predictions = np.argmax(probabilities, axis=1).astype(np.int8)
+        confidence = probabilities[np.arange(len(probabilities)), predictions]
+        predictions[confidence == 0] = -1
+        return predictions, confidence
+
+    def get_feature_importance(self, limit: int = 20) -> list[tuple[str, float]]:
+        """Return ranked feature importances, highest first.
+
+        Args:
+            limit: Maximum number of features to return.
+
+        Returns:
+            List of ``(feature_name, importance)`` tuples sorted by importance
+            descending. Returns an empty list if the classifier is untrained or
+            does not expose feature importances.
+        """
+        if self._classifier is None or self._feature_names is None:
+            return []
+        if not hasattr(self._classifier, "feature_importances_"):
+            return []
+        importances = list(np.asarray(self._classifier.feature_importances_).reshape(-1))
+        feature_importance = [
+            (feature, round(importance, 2))
+            for feature, importance in zip(self._feature_names, importances, strict=True)
+        ]
+        feature_importance.sort(key=lambda x: x[1], reverse=True)
+        return feature_importance[:limit]
+
+    def save(self, path: Path) -> None:
+        """Serialize the classifier to disk using joblib.
+
+        Args:
+            path: Destination file path.
+        """
+        joblib.dump(self, path)
+        if self._classifier_file is None:
+            self._classifier_file = Path(path).name
+            self._classifier_hash = hash_file(Path(path))
+            self._classifier_source = "serialized"
+
+    @classmethod
+    def from_pickle(cls, path: Path) -> BaseClassifier:
+        """Load a classifier from a pickle file with full validation and metadata backfill.
+
+        Applies the same version, classifier-type, and metadata checks as
+        :meth:`load`, but as a classmethod factory so no dummy instance is
+        required. The class of the returned object is determined by the
+        calling class - ``Classifier.from_pickle(...)`` rejects pickled
+        ``MultiClassClassifier`` instances and vice versa.
+
+        Args:
+            path: Path to the saved classifier pickle file.
+
+        Returns:
+            Loaded and validated classifier instance of type ``cls``.
+
+        Raises:
+            ValueError: If the file is not an instance of ``cls``, was trained
+                with an incompatible sklearn or JABS version, or uses an
+                unsupported classifier type.
+        """
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            warnings.simplefilter("always", InconsistentVersionWarning)
+            c = joblib.load(path)  # NOTE: unpickles arbitrary objects; only load trusted files
+        for warning in caught_warnings:  # recording has ended, so replays reach the caller
+            if issubclass(warning.category, InconsistentVersionWarning):
+                raise ValueError("Classifier trained with different version of sklearn.")
+            warnings.warn(warning.message, warning.category, stacklevel=2)
+
+        if not isinstance(c, cls):
+            raise ValueError(f"{path} is not an instance of {cls.__name__}")
+
+        if c._version != cls._VERSION:
+            raise ValueError(
+                f"Unable to deserialize pickled classifier. "
+                f"File version {c._version}, expected {cls._VERSION}."
+            )
+
+        if c._classifier_type not in cls._supported_classifier_choices():
+            raise ValueError("Invalid classifier type")
+
+        if c._classifier_file is None:
+            c._classifier_file = Path(path).name
+            c._classifier_hash = hash_file(Path(path))
+            c._classifier_source = "pickle"
+
+        return c
+
+    def load(self, path: Path) -> None:
+        """Deserialize a classifier from disk, updating this instance in place.
+
+        Args:
+            path: Source file path.
+
+        Raises:
+            ValueError: If the file is not an instance of this class, was saved with a
+                different JABS or sklearn version, or uses an unsupported classifier type.
+        """
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            warnings.simplefilter("always", InconsistentVersionWarning)
+            c = joblib.load(path)  # NOTE: unpickles arbitrary objects; only load trusted files
+        for warning in caught_warnings:  # recording has ended, so replays reach the caller
+            if issubclass(warning.category, InconsistentVersionWarning):
+                raise ValueError("Classifier trained with different version of sklearn.")
+            warnings.warn(warning.message, warning.category, stacklevel=2)
+
+        if not isinstance(c, type(self)):
+            raise ValueError(f"{path} is not an instance of {type(self).__name__}")
+
+        if c._version != self._VERSION:
+            raise ValueError(
+                f"Unable to deserialize pickled classifier. "
+                f"File version {c._version}, expected {self._VERSION}."
+            )
+
+        if c._classifier_type not in self._supported_classifiers:
+            raise ValueError("Invalid classifier type")
+
+        for attr in self._PERSISTED_REQUIRED:
+            setattr(self, attr, getattr(c, attr))
+        for attr in self._PERSISTED_OPTIONAL:
+            setattr(self, attr, getattr(c, attr, None))
+
+        if c._classifier_file is not None:
+            self._classifier_file = c._classifier_file
+            self._classifier_hash = c._classifier_hash
+            self._classifier_source = c._classifier_source
+        else:
+            self._classifier_file = Path(path).name
+            self._classifier_hash = hash_file(Path(path))
+            self._classifier_source = "pickle"
diff --git a/src/jabs/classifier/classifier.py b/src/jabs/classifier/classifier.py
index bc21a58f..fdb9ab5a 100644
--- a/src/jabs/classifier/classifier.py
+++ b/src/jabs/classifier/classifier.py
@@ -1,12 +1,12 @@
+"""Binary behavior classifier (behavior vs. not-behavior)."""
+
 import logging
-import typing
 import warnings
 from pathlib import Path
+from typing import ClassVar
 
-import joblib
 import numpy as np
 import pandas as pd
-from sklearn.exceptions import InconsistentVersionWarning
 
 from jabs.core.enums import (
     DEFAULT_CV_GROUPING_STRATEGY,
@@ -14,63 +14,53 @@
     CrossValidationGroupingStrategy,
 )
 from jabs.core.utils import hash_file
-from jabs.project import Project, TrackLabels, load_training_data
+from jabs.project import Project, load_training_data
 
 from . import classifier_utils
-from .factories import XGBOOST_AVAILABLE, make_catboost, make_random_forest, make_xgboost
-
-_VERSION = 11
+from .base import BaseClassifier
 
-# _CLASSIFIER_FACTORIES serves as both the single source of truth for classifiers
-# supported by the current JABS environment, in addition to the mapping of ClassifierTypes
-# to factory functions that produce instantiated classifiers for that type.
-# XGBoost availability is detected once in factories.py; import XGBOOST_AVAILABLE here
-# rather than re-probing so that the warning is emitted exactly once.
-_CLASSIFIER_FACTORIES: dict[ClassifierType, typing.Callable[[int, int | None], typing.Any]] = {
-    ClassifierType.RANDOM_FOREST: make_random_forest,
-    ClassifierType.CATBOOST: make_catboost,
-}
+logger = logging.getLogger(__name__)
 
-if XGBOOST_AVAILABLE:
-    _CLASSIFIER_FACTORIES[ClassifierType.XGBOOST] = make_xgboost
 
+class Classifier(BaseClassifier):
+    """A binary behavior classifier (behavior vs. not-behavior).
 
-class Classifier:
-    """A machine learning classifier for behavior classification tasks.
-
-    This class supports training, evaluating, saving, and loading classifiers
-    for behavioral data using Random Forest or XGBoost algorithms.
-    It provides utilities for data splitting, balancing, augmentation, and feature management.
+    Supports training, evaluating, saving, and loading classifiers for
+    behavioral data using Random Forest, CatBoost, or XGBoost algorithms.
+    Persistence and identity machinery are inherited from
+    :class:`BaseClassifier`.
 
     Attributes:
-        LABEL_THRESHOLD (int): Minimum number of labels required per group.
+        LABEL_THRESHOLD: Minimum number of labels required per group.
     """
 
-    LABEL_THRESHOLD = 20
-
-    def __init__(self, classifier: ClassifierType = ClassifierType.RANDOM_FOREST, n_jobs: int = 1):
-        self._classifier_type = classifier
-        self._classifier = None
-        self._project_settings = None
-        self._behavior = None
-        self._feature_names = None
-        self._n_jobs = n_jobs
-        self._version = _VERSION
-
-        self._classifier_file = None
-        self._classifier_hash = None
-        self._classifier_source = None
-        self._supported_classifiers = self._supported_classifier_choices()
-
-        # make sure the value passed for the classifier parameter is valid
-        if classifier not in self._supported_classifiers:
-            raise ValueError("Invalid classifier type")
+    LABEL_THRESHOLD: ClassVar[int] = classifier_utils.LABEL_THRESHOLD
+
+    _VERSION: ClassVar[int] = 11
+    _MULTICLASS: ClassVar[bool] = False
+    _PERSISTED_REQUIRED: ClassVar[tuple[str, ...]] = (
+        "_classifier",
+        "_behavior",
+        "_project_settings",
+        "_classifier_type",
+        "_feature_names",
+    )
+
+    def __init__(
+        self,
+        classifier: ClassifierType = ClassifierType.RANDOM_FOREST,
+        n_jobs: int = 1,
+    ) -> None:
+        super().__init__(classifier_type=classifier, n_jobs=n_jobs)
+        self._behavior: str | None = None
 
     @classmethod
-    def from_training_file(cls, path: Path, classifier_type: ClassifierType | None = None):
+    def from_training_file(
+        cls, path: Path, classifier_type: ClassifierType | None = None
+    ) -> "Classifier":
         """Initialize a classifier from an exported training data file.
 
-        This method will load the training data and train a classifier.
+        This method loads the training data and trains a classifier.
 
         Args:
             path: exported training data file
@@ -78,7 +68,7 @@ def from_training_file(cls, path: Path, classifier_type: ClassifierType | None =
                 file. If ``None``, the type recorded in the file is used.
 
         Returns:
-            trained classifier object
+            trained Classifier object
         """
         loaded_training_data, _ = load_training_data(path)
         behavior = loaded_training_data["behavior"]
@@ -91,8 +81,10 @@ def from_training_file(cls, path: Path, classifier_type: ClassifierType | None =
         if effective_type in classifier._supported_classifiers:
             classifier.set_classifier(effective_type)
         else:
-            logging.warning(
-                f"Specified classifier type {effective_type.name} is unavailable, using default: {classifier.classifier_type.name}"
+            logger.warning(
+                "Specified classifier type %s is unavailable, using default: %s",
+                effective_type.name,
+                classifier.classifier_type.name,
             )
         training_features = classifier.combine_data(
             loaded_training_data["per_frame"], loaded_training_data["window"]
@@ -111,94 +103,41 @@ def from_training_file(cls, path: Path, classifier_type: ClassifierType | None =
 
         return classifier
 
-    @property
-    def classifier_name(self) -> str:
-        """return the name of the classifier used as a string"""
-        return self._classifier_type.value
-
-    @property
-    def classifier_type(self) -> ClassifierType:
-        """return classifier type"""
-        return self._classifier_type
-
-    @property
-    def classifier_file(self) -> str | None:
-        """return the filename of the saved classifier"""
-        return self._classifier_file
-
-    @property
-    def classifier_hash(self) -> str | None:
-        """return the hash of the classifier file"""
-        return self._classifier_hash
-
-    @property
-    def project_settings(self) -> dict:
-        """return a copy of dictionary of project settings for this classifier"""
-        if self._project_settings is not None:
-            return dict(self._project_settings)
-        return {}
-
     @property
     def behavior_name(self) -> str | None:
-        """return the behavior name property"""
+        """Return the behavior name property."""
         return self._behavior
 
     @behavior_name.setter
-    def behavior_name(self, value) -> None:
-        """set the behavior name property"""
+    def behavior_name(self, value: str | None) -> None:
+        """Set the behavior name property."""
         self._behavior = value
 
-    @property
-    def version(self) -> int:
-        """return the classifier format version"""
-        return self._version
-
-    @property
-    def feature_names(self) -> list[str] | None:
-        """returns the list of feature names used when training this classifier"""
-        return self._feature_names
-
     @staticmethod
-    def get_leave_one_group_out_max(labels, groups):
-        """counts the number of possible leave one out groups for k-fold cross validation
+    def get_leave_one_group_out_max(labels: np.ndarray, groups: np.ndarray) -> int:
+        """Count the number of possible leave-one-group-out splits.
 
         Args:
-            labels: labels to check if they were above the threshold
-            groups: group id corresponding to the labels
+            labels: Labels to check against the per-class threshold.
+            groups: Group id corresponding to each label.
 
         Returns:
-            int of the maximum number of cross validation to use
+            Number of groups that can serve as a valid test split.
 
         Note: labels excludes label for frames with no identity.
         """
-        labels = np.asarray(labels)
-        groups = np.asarray(groups)
-        unique_groups = np.unique(groups)
-        count = 0
-        for g in unique_groups:
-            test_mask = groups == g
-            test_labels = labels[test_mask]
-            train_labels = labels[~test_mask]
-            # Test split must have both classes above threshold.
-            test_ok = (
-                np.sum(test_labels == TrackLabels.Label.BEHAVIOR) >= Classifier.LABEL_THRESHOLD
-                and np.sum(test_labels == TrackLabels.Label.NOT_BEHAVIOR)
-                >= Classifier.LABEL_THRESHOLD
-            )
-            # Training split must also have both classes above threshold so the
-            # model can learn every class regardless of which group is held out.
-            train_ok = (
-                np.sum(train_labels == TrackLabels.Label.BEHAVIOR) >= Classifier.LABEL_THRESHOLD
-                and np.sum(train_labels == TrackLabels.Label.NOT_BEHAVIOR)
-                >= Classifier.LABEL_THRESHOLD
-            )
-            if test_ok and train_ok:
-                count += 1
-        return count
+        return classifier_utils.count_valid_logo_splits(
+            labels, groups, label_threshold=Classifier.LABEL_THRESHOLD
+        )
 
     @staticmethod
-    def leave_one_group_out(per_frame_features, window_features, labels, groups):
-        """implements "leave one group out" data splitting strategy
+    def leave_one_group_out(
+        per_frame_features: pd.DataFrame,
+        window_features: pd.DataFrame,
+        labels: np.ndarray,
+        groups: np.ndarray,
+    ):
+        """Yield "leave one group out" train/test splits.
 
         Args:
             per_frame_features: per frame features for all labeled data
@@ -206,15 +145,9 @@ def leave_one_group_out(per_frame_features, window_features, labels, groups):
             labels: labels corresponding to each feature row
             groups: group id corresponding to each feature row
 
-        Returns:
-            dictionary of training and test data and labels:
-        {
-            'training_data': list of numpy arrays,
-            'test_data': list of numpy arrays,
-            'training_labels': numpy array,
-            'test_labels': numpy_array,
-            'feature_names': list of feature names
-        }
+        Yields:
+            Dict with training_data, test_data, training_labels, test_labels,
+            and feature_names.
         """
         yield from classifier_utils.leave_one_group_out(
             per_frame_features,
@@ -225,109 +158,55 @@ def leave_one_group_out(per_frame_features, window_features, labels, groups):
         )
 
     @staticmethod
-    def downsample_balance(features, labels, random_seed=None):
-        """downsamples features and labels such that labels are equally distributed
-
-        Args:
-            features: features to downsample
-            labels: labels to downsample
-            random_seed: optional random seed
-
-        Returns:
-            tuple of downsampled features, labels
-        """
+    def downsample_balance(
+        features: pd.DataFrame, labels: np.ndarray, random_seed: int | None = None
+    ):
+        """Downsample features and labels to an equal class distribution."""
         return classifier_utils.downsample_balance(features, labels, random_seed)
 
     @staticmethod
-    def augment_symmetric(features, labels, random_str="ASygRQDZJD"):
-        """augments the features to include L-R and R-L duplicates
-
-        This requires 'left' or 'right' to be in the feature name to be swapped
-        Features that don't include these terms will not be swapped
-
-        Args:
-            features: features to augment
-            labels: labels to augment
-            random_str: a random string to use as a temporary
-                replacement when swapping left/right
-
-        Returns:
-            tuple of augmented features, labels
-        """
+    def augment_symmetric(
+        features: pd.DataFrame, labels: np.ndarray, random_str: str = "ASygRQDZJD"
+    ):
+        """Augment features with left/right reflected duplicates."""
         return classifier_utils.augment_symmetric(features, labels, random_str)
 
-    def set_classifier(self, classifier: ClassifierType):
-        """change the type of the classifier being used"""
-        if classifier not in self._supported_classifiers:
-            raise ValueError("Invalid Classifier Type")
-        self._classifier_type = classifier
+    def set_project_settings(self, project: Project) -> None:
+        """Assign project settings to the classifier.
 
-    def set_project_settings(self, project: Project):
-        """assign project settings to the classifier
+        If no behavior is currently set, uses project defaults; otherwise looks
+        up the behavior-scoped settings from the project's settings manager.
 
         Args:
-            project: project to copy classifier-relevant settings from for the current behavior
-
-        if no behavior is currently set will use project defaults
+            project: Project to copy classifier-relevant settings from.
         """
         if self._behavior is None:
             self._project_settings = project.get_project_defaults()
         else:
             self._project_settings = project.settings_manager.get_behavior(self._behavior)
 
-    def set_dict_settings(self, settings: dict):
-        """assign project settings via a dict to the classifier
+    def train(self, data: dict, random_seed: int | None = None) -> None:
+        """Train the classifier.
 
         Args:
-            settings: dict of project settings. Must be same structure as project.settings_manager.get_behavior
-
-        TODO: Add checks to enforce conformity to project settings
-        """
-        self._project_settings = dict(settings)
-
-    def classifier_choices(self):
-        """get the available classifier types
-
-        Returns:
-            dict where keys are ClassifierType enum values, and the
-              values are string names for the classifiers.
-        """
-        return {t: t.value for t in sorted(self._supported_classifiers, key=lambda t: t.value)}
-
-    def _create_classifier(self, random_seed: int | None = None):
-        """Instantiate the underlying classifier for the current classifier type."""
-        try:
-            factory = _CLASSIFIER_FACTORIES[self._classifier_type]
-        except KeyError:
-            raise ValueError(f"Unsupported classifier type: {self._classifier_type!r}") from None
-        return factory(self._n_jobs, random_seed)
-
-    def train(self, data, random_seed: int | None = None):
-        """train the classifier
+            data: dict returned from train_test_split().
+            random_seed: optional random seed for reproducibility.
 
-        Args:
-            data: dict returned from train_test_split()
-            random_seed: optional random seed (used when we want
-                reproducible results between trainings)
-
-        Returns:
-            None
-
-        raises ValueError for having either unset project settings or an unset classifier
+        Raises:
+            ValueError: If project settings are unset.
         """
         if self._project_settings is None:
             raise ValueError("Project settings for classifier unset, cannot train classifier.")
 
-        # Assume that feature names is provided, otherwise extract it from the dataframe
         if "feature_names" in data:
             self._feature_names = data["feature_names"]
         else:
             self._feature_names = data["training_data"].columns.to_list()
 
-        # Obtain the feature and label matrices
         features = data["training_data"]
         labels = data["training_labels"]
-        # Symmetric augmentation should occur before balancing so that the class with more labels can sample from the whole set
+        # Symmetric augmentation should occur before balancing so that the
+        # class with more labels can sample from the whole set.
         if self._project_settings.get("symmetric_behavior", False):
             features, labels = self.augment_symmetric(features, labels)
         if self._project_settings.get("balance_labels", False):
@@ -339,45 +218,28 @@ def train(self, data, random_seed: int | None = None):
             warnings.simplefilter("ignore", category=FutureWarning)
             self._classifier = classifier.fit(cleaned_features, labels)
 
-        # Classifier may have been re-used from a prior training, blank the logging attributes
         self._classifier_file = None
         self._classifier_hash = None
         self._classifier_source = None
 
-    def get_features_to_classify(self, features: pd.DataFrame) -> pd.DataFrame:
-        """gets features for classification, handling classifier-specific quirks."""
-        if self.classifier_type == ClassifierType.XGBOOST:
-            # XGBoost feature names are obtained from the booster
-            classifier_columns = self._classifier.get_booster().feature_names
-        else:
-            # For other classifiers, use the feature names from the underlying model
-            if hasattr(self._classifier, "feature_names_in_"):
-                classifier_columns = list(self._classifier.feature_names_in_)
-            elif hasattr(self._classifier, "feature_names_"):
-                classifier_columns = list(self._classifier.feature_names_)
-            else:
-                raise RuntimeError("Error obtaining feature names from classifier.")
-
-        return features[classifier_columns]
-
     def predict(
         self, features: pd.DataFrame, frame_indexes: np.ndarray | None = None
     ) -> np.ndarray:
-        """predict classes for a given set of features
+        """Predict classes for a given set of features.
 
         Args:
-            features: DataFrame of feature data to classify
-            frame_indexes: frame indexes to classify (default all)
+            features: DataFrame of feature data to classify.
+            frame_indexes: Frame indexes to classify (default all).
 
         Returns:
-            predicted class vector
+            Predicted class vector. Frames absent from ``frame_indexes`` are
+            assigned -1.
         """
-        cleaned_features = self.get_features_to_classify(self._clean_features(features))
+        cleaned_features = self._get_features_to_classify(self._clean_features(features))
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=FutureWarning)
             result = self._classifier.predict(cleaned_features)
 
-        # Insert -1s into class prediction when no prediction is made
         if frame_indexes is not None:
             result_adjusted = np.full(result.shape, -1, dtype=np.int8)
             result_adjusted[frame_indexes] = result[frame_indexes]
@@ -388,21 +250,21 @@ def predict(
     def predict_proba(
         self, features: pd.DataFrame, frame_indexes: np.ndarray | None = None
     ) -> np.ndarray:
-        """predict probabilities for a given set of features.
+        """Predict probabilities for a given set of features.
 
         Args:
-            features: DataFrame of feature data to classify
-            frame_indexes: frame indexes to classify (default all)
+            features: DataFrame of feature data to classify.
+            frame_indexes: Frame indexes to classify (default all).
 
         Returns:
-            prediction probability matrix
+            Prediction probability matrix. Frames absent from ``frame_indexes``
+            are assigned zero probabilities.
         """
-        cleaned_features = self.get_features_to_classify(self._clean_features(features))
+        cleaned_features = self._get_features_to_classify(self._clean_features(features))
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=FutureWarning)
             result = self._classifier.predict_proba(cleaned_features)
 
-        # Insert 0 probabilities when no prediction is made
         if frame_indexes is not None:
             result_adjusted = np.full(result.shape, 0, dtype=np.float32)
             result_adjusted[frame_indexes] = result[frame_indexes]
@@ -410,209 +272,52 @@ def predict_proba(
 
         return result
 
-    def save(self, path: Path):
-        """save the classifier to a file
-
-        Uses joblib to serialize the classifier object to a file.
-        """
-        joblib.dump(self, path)
-
-        # If the classifier was not generated from exported training data
-        # we can hash the serialized classifier.
-        # Note that this hash changes every time the "train" button is
-        # pressed, regardless of whether the training data changes.
-        if self._classifier_file is None:
-            self._classifier_file = Path(path).name
-            self._classifier_hash = hash_file(Path(path))
-            self._classifier_source = "serialized"
-
-    @classmethod
-    def from_pickle(cls, path: Path) -> "Classifier":
-        """Load a Classifier from a pickle file with full validation and metadata backfill.
-
-        Applies the same version, classifier-type, and metadata checks as ``load()``,
-        but as a classmethod factory so no dummy instance is required.
+    def print_feature_importance(self, limit: int = 20) -> None:
+        """Print the most important features and their importance.
 
         Args:
-            path: Path to the saved classifier pickle file.
-
-        Returns:
-            Loaded and validated ``Classifier`` instance.
-
-        Raises:
-            ValueError: If the file is not a ``Classifier``, was trained with an
-                incompatible sklearn or JABS version, or uses an unsupported
-                classifier type.
-        """
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always", InconsistentVersionWarning)
-            c = joblib.load(path)
-            for warning in caught_warnings:
-                if issubclass(warning.category, InconsistentVersionWarning):
-                    raise ValueError("Classifier trained with different version of sklearn.")
-                else:
-                    warnings.warn(warning.message, warning.category, stacklevel=2)
-
-        if not isinstance(c, cls):
-            raise ValueError(f"{path} is not an instance of Classifier")
-
-        if c.version != _VERSION:
-            raise ValueError(
-                f"Unable to deserialize pickled classifier. File version {c.version}, expected {_VERSION}."
-            )
-
-        if c._classifier_type not in cls._supported_classifier_choices():
-            raise ValueError("Invalid classifier type")
-
-        if c._classifier_file is None:
-            c._classifier_file = Path(path).name
-            c._classifier_hash = hash_file(Path(path))
-            c._classifier_source = "pickle"
-
-        return c
-
-    def load(self, path: Path):
-        """load a classifier from a file
-
-        Uses joblib to deserialize the classifier object that was previously saved
-        using the joblib.dump() method.
+            limit: Maximum number of features to print.
         """
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always", InconsistentVersionWarning)
-            c = joblib.load(path)
-            for warning in caught_warnings:
-                if issubclass(warning.category, InconsistentVersionWarning):
-                    raise ValueError("Classifier trained with different version of sklearn.")
-                else:
-                    warnings.warn(warning.message, warning.category, stacklevel=2)
-
-        if not isinstance(c, Classifier):
-            raise ValueError(f"{path} is not instance of Classifier")
-
-        if c.version != _VERSION:
-            raise ValueError(
-                f"Unable to deserialize pickled classifier. File version {c.version}, expected {_VERSION}."
-            )
-
-            # make sure the value passed for the classifier parameter is valid
-        if c._classifier_type not in self._supported_classifiers:
-            raise ValueError("Invalid classifier type")
-
-        self._classifier = c._classifier
-        self._behavior = c._behavior
-        self._project_settings = c._project_settings
-        self._classifier_type = c._classifier_type
-        if c._classifier_file is not None:
-            self._classifier_file = c._classifier_file
-            self._classifier_hash = c._classifier_hash
-            self._classifier_source = c._classifier_source
-        else:
-            self._classifier_file = Path(path).name
-            self._classifier_hash = hash_file(Path(path))
-            self._classifier_source = "pickle"
+        feature_importance = self.get_feature_importance(limit=limit)
+        print(f"{'Feature Name':100} Importance")
+        print("-" * 120)
+        for feature, importance in feature_importance[:limit]:
+            print(f"{feature:100} {importance:0.2f}")
 
     @staticmethod
-    def accuracy_score(truth, predictions):
-        """return accuracy score"""
+    def accuracy_score(truth: np.ndarray, predictions: np.ndarray) -> float:
+        """Return accuracy score."""
         return classifier_utils.accuracy_score(truth, predictions)
 
     @staticmethod
-    def precision_recall_score(truth, predictions):
-        """return precision recall score"""
+    def precision_recall_score(truth: np.ndarray, predictions: np.ndarray):
+        """Return precision/recall/f-score/support."""
         return classifier_utils.precision_recall_score(truth, predictions)
 
     @staticmethod
-    def confusion_matrix(truth, predictions):
-        """return the confusion matrix using sklearn's confusion_matrix function"""
+    def confusion_matrix(truth: np.ndarray, predictions: np.ndarray) -> np.ndarray:
+        """Return the confusion matrix."""
         return classifier_utils.confusion_matrix(truth, predictions)
 
-    @staticmethod
-    def combine_data(per_frame, window):
-        """combine feature sets together
-
-        Args:
-            per_frame: per frame features dataframe
-            window: window feature dataframe
-
-        Returns:
-            merged dataframe
-        """
-        return classifier_utils.combine_data(per_frame, window)
-
-    def get_feature_importance(self, limit=20) -> list[tuple[str, float]]:
-        """get the most important features and their importance
-
-        Args:
-            limit: maximum number of features to return, defaults to 20
-
-        Returns:
-            list of tuples of feature name and importance
-        """
-        # Get numerical feature importance
-        importances = list(self._classifier.feature_importances_)
-        # List of tuples with variable and importance
-        feature_importance = [
-            (feature, round(importance, 2))
-            for feature, importance in zip(self._feature_names, importances, strict=True)
-        ]
-        # Sort the feature importance by most important first
-        feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)
-        return feature_importance[:limit]
-
-    def print_feature_importance(self, limit=20):
-        """print the most important features and their importance
-
-        Args:
-            limit: maximum number of features to print, defaults to 20
-        """
-        feature_importance = self.get_feature_importance(limit=limit)
-        # Print out the feature and importance
-        print(f"{'Feature Name':100} Importance")
-        print("-" * 120)
-        for feature, importance in feature_importance[:limit]:
-            print(f"{feature:100} {importance:0.2f}")
-
     @staticmethod
     def count_label_threshold(
         all_counts: dict,
         cv_grouping_strategy: CrossValidationGroupingStrategy = DEFAULT_CV_GROUPING_STRATEGY,
     ) -> int:
-        """counts the number of groups that meet label threshold criteria
+        """Count groups that meet the label-threshold criteria.
 
         Args:
-            all_counts: labeled frame and bout counts for the entire
-                project
-
-
-            all_counts is a dict with the following form
-            {
-                '<video name>': {
-                    <identity>: {
-                        "fragmented_frame_counts": (
-                            behavior frame count: fragmented,
-                            not behavior frame count: fragmented),
-                        "fragmented_bout_counts": (
-                            behavior bout count: fragmented,
-                            not behavior bout count: fragmented
-                        ),
-                        "unfragmented_frame_counts": (
-                            behavior frame count: unfragmented,
-                            not behavior frame count: unfragmented
-                        ),
-                        "unfragmented_bout_counts": (
-                            behavior bout count: unfragmented,
-                            not behavior bout count: unfragmented
-                        ),
-                    },
-                }
-            }
-
-            cv_grouping_strategy: cross-validation grouping strategy
+            all_counts: Labeled frame and bout counts for the entire project.
+                Structure is a dict[video_name][identity] of fragmented and
+                unfragmented frame/bout count tuples.
+            cv_grouping_strategy: Cross-validation grouping strategy.
 
         Returns:
-            number of groups that meet label criteria
+            Number of groups that meet the labeling threshold criteria.
 
-        Note: uses "fragmented" label counts, since these reflect the counts of labels that are usable for training
+        Note:
+            Uses "fragmented" label counts since these reflect labels usable
+            for training.
         """
         group_count = 0
         if cv_grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
@@ -646,64 +351,17 @@ def label_threshold_met(
         min_groups: int,
         cv_grouping_strategy: CrossValidationGroupingStrategy = DEFAULT_CV_GROUPING_STRATEGY,
     ) -> bool:
-        """determine if the labeling threshold is met
+        """Determine whether the labeling threshold is met.
 
         Args:
-            all_counts: labeled frame and bout counts for the entire
-                project
-            min_groups: minimum number of groups required (more than one
-                group is always required for the "leave one group out" train/test split,
-                but may be more than 2 for k-fold cross validation if k > 2)
-            cv_grouping_strategy: cross-validation grouping strategy
+            all_counts: Labeled frame and bout counts for the entire project.
+            min_groups: Minimum number of groups required.
+            cv_grouping_strategy: Cross-validation grouping strategy.
 
         Returns:
-            bool if requested valid groups is > valid group
+            True if there are enough groups meeting the threshold.
         """
         group_count = Classifier.count_label_threshold(
             all_counts, cv_grouping_strategy=cv_grouping_strategy
         )
         return 1 < group_count >= min_groups
-
-    @staticmethod
-    def _supported_classifier_choices() -> set[ClassifierType]:
-        """Determine the list of supported classifier types in the current JABS environment."""
-        return set(_CLASSIFIER_FACTORIES.keys())
-
-    def _clean_features(self, features: pd.DataFrame) -> pd.DataFrame:
-        """Clean features for prediction, handling missing and infinite values.
-
-        Args:
-            features: DataFrame of feature data to clean.
-
-        Returns:
-            Cleaned DataFrame with missing and infinite values handled.
-        """
-        return classifier_utils.clean_features(features, self._classifier_type)
-
-    @staticmethod
-    def derive_predictions(probabilities: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
-        """Derive predicted classes from predicted probabilities.
-
-        Args:
-            probabilities: Array of predicted probabilities for each class.
-
-        Returns:
-            Array of predicted classes. Frames where no pose is detected are assigned -1.
-
-        TODO: Consider returning predictions where there is no pose instead of removing.
-          If the gap in pose is small, accurate predictions might be possible due to window features.
-          Maybe allow a maximum gap size parameter?
-        """
-        # Derive predictions by taking argmax (class with highest probability)
-        # This is equivalent to predict() but avoids duplicate computation
-        predictions = np.argmax(probabilities, axis=1).astype(np.int8)
-
-        # Use predictions as column indexes for each row of prob
-        probabilities = probabilities[np.arange(len(probabilities)), predictions]
-
-        # currently, predict_proba sets probabilities to 0 for frames with no pose
-        # we're going to set the predictions for those frames to -1 (no prediction)
-        no_pose_frames = np.where(probabilities == 0)[0]
-        predictions[no_pose_frames] = -1
-
-        return predictions, probabilities
diff --git a/src/jabs/classifier/classifier_utils.py b/src/jabs/classifier/classifier_utils.py
index ef1b2c02..d76afd9b 100644
--- a/src/jabs/classifier/classifier_utils.py
+++ b/src/jabs/classifier/classifier_utils.py
@@ -21,7 +21,9 @@
 )
 from sklearn.model_selection import LeaveOneGroupOut
 
+from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
 from jabs.core.enums import ClassifierType
+from jabs.project import TrackLabels
 
 LABEL_THRESHOLD: int = 20
 
@@ -121,6 +123,50 @@ def downsample_balance(
     return features, labels
 
 
+def logo_split_is_valid(
+    test_labels: npt.NDArray,
+    train_labels: npt.NDArray,
+    all_classes: npt.NDArray,
+    label_threshold: int,
+    min_test_classes: int | None,
+) -> bool:
+    """Decide whether one leave-one-group-out split is usable for cross-validation.
+
+    Training side: every class in ``all_classes`` needs at least
+    ``label_threshold`` samples; if any class falls short the split is
+    rejected without looking at the test side.
+
+    Test side, selected by ``min_test_classes``:
+
+    - ``None`` (binary default): every class needs at least ``label_threshold`` samples.
+    - integer (multi-class): at least ``min_test_classes`` of the classes need
+      ``label_threshold`` samples or more.
+
+    Args:
+        test_labels: Labels of the held-out (test) portion.
+        train_labels: Labels of the training portion.
+        all_classes: Class values checked in both portions.
+        label_threshold: Minimum per-class sample count (inclusive).
+        min_test_classes: How many classes must reach the threshold in the
+            test portion, or ``None`` to require all of them.
+
+    Returns:
+        True when both portions meet their criteria, False otherwise.
+    """
+    # Training side first: a class without enough samples rules the split out.
+    for cls in all_classes:
+        if np.count_nonzero(train_labels == cls) < label_threshold:
+            return False
+
+    # One flag per class: does the test portion hold enough samples of it?
+    test_ok = [np.count_nonzero(test_labels == cls) >= label_threshold for cls in all_classes]
+    if min_test_classes is None:
+        # Every class has to qualify.
+        return all(test_ok)
+    # Otherwise a minimum number of qualifying classes is enough.
+    return sum(test_ok) >= min_test_classes
+
+
 def leave_one_group_out(
     per_frame_features: pd.DataFrame,
     window_features: pd.DataFrame,
@@ -131,26 +177,8 @@ def leave_one_group_out(
 ) -> Generator[dict, None, None]:
     """Implement the leave-one-group-out data splitting strategy.
 
-    A split is accepted only when **both** the test and training portions satisfy
-    their respective class-count requirements.
-
-    When ``min_test_classes`` is ``None`` (default, binary mode), a split is
-    accepted when:
-
-    - Every class present in the full ``labels`` array appears at least
-      ``label_threshold`` times in the test split.
-    - The training split also contains every class at least ``label_threshold``
-      times (guards against the held-out group being the sole source of a rare
-      class).
-
-    When ``min_test_classes`` is an integer (multi-class mode), a split is
-    accepted when:
-
-    - The test split contains at least ``min_test_classes`` distinct classes each
-      with at least ``label_threshold`` samples.
-    - The training split contains **all** classes at least ``label_threshold``
-      times (so the model can learn every class regardless of what appears in
-      the test split).
+    A split is accepted only when both the test and training portions satisfy
+    :func:`logo_split_is_valid`.
 
     Args:
         per_frame_features: Per-frame feature DataFrame for labeled data.
@@ -179,47 +207,65 @@ def leave_one_group_out(
     random.shuffle(splits)
     count = 0
     for split in splits:
-        test_labels = labels[split[1]]
-        if min_test_classes is None:
-            # Binary mode: all classes must appear above threshold in both splits.
-            test_ok = all(
-                np.count_nonzero(test_labels == cls) >= label_threshold for cls in all_classes
-            )
-            if test_ok:
-                # Also require all classes above threshold in the training split
-                # so the model can learn every class regardless of the test group.
-                train_labels = labels[split[0]]
-                test_ok = all(
-                    np.count_nonzero(train_labels == cls) >= label_threshold for cls in all_classes
-                )
-        else:
-            # Multi-class mode: test split needs at least min_test_classes
-            # classes above threshold; training split must have all classes.
-            n_test_classes = sum(
-                np.count_nonzero(test_labels == cls) >= label_threshold for cls in all_classes
-            )
-            train_labels = labels[split[0]]
-            train_has_all = all(
-                np.count_nonzero(train_labels == cls) >= label_threshold for cls in all_classes
-            )
-            test_ok = n_test_classes >= min_test_classes and train_has_all
-
-        if test_ok:
-            count += 1
-            yield {
-                "training_data": x.iloc[split[0]],
-                "training_labels": labels[split[0]],
-                "training_idx": split[0],
-                "test_data": x.iloc[split[1]],
-                "test_labels": labels[split[1]],
-                "test_idx": split[1],
-                "test_group": groups[split[1]][0],
-                "feature_names": x.columns.to_list(),
-            }
+        if not logo_split_is_valid(
+            labels[split[1]], labels[split[0]], all_classes, label_threshold, min_test_classes
+        ):
+            continue
+        count += 1
+        yield {
+            "training_data": x.iloc[split[0]],
+            "training_labels": labels[split[0]],
+            "training_idx": split[0],
+            "test_data": x.iloc[split[1]],
+            "test_labels": labels[split[1]],
+            "test_idx": split[1],
+            "test_group": groups[split[1]][0],
+            "feature_names": x.columns.to_list(),
+        }
     if count == 0:
         raise ValueError("unable to split data")
 
 
+def count_valid_logo_splits(
+    labels: npt.NDArray,
+    groups: npt.NDArray,
+    label_threshold: int = LABEL_THRESHOLD,
+    min_test_classes: int | None = None,
+) -> int:
+    """Return how many groups can act as the held-out set of a valid LOGO split.
+
+    Each distinct group is held out in turn and judged with
+    :func:`logo_split_is_valid`, the same acceptance rule that
+    :func:`leave_one_group_out` applies. Only labels and group IDs are needed,
+    so callers can learn the number of usable cross-validation iterations
+    before any feature matrix is assembled.
+
+    Args:
+        labels: Per-frame label array.
+        groups: Per-frame group IDs, aligned with ``labels``.
+        label_threshold: Minimum per-class sample count a split must reach.
+        min_test_classes: Number of classes that must reach the threshold in
+            the held-out portion, or ``None`` to require every class.
+
+    Returns:
+        Count of groups whose hold-out split is accepted.
+    """
+    label_arr = np.asarray(labels)
+    group_arr = np.asarray(groups)
+    # Classes are taken from the full label array, not from each split.
+    classes = np.unique(label_arr)
+
+    def _is_valid_holdout(group_id) -> bool:
+        # Frames of this group form the test portion; all others train.
+        held_out = group_arr == group_id
+        return logo_split_is_valid(
+            label_arr[held_out], label_arr[~held_out], classes, label_threshold, min_test_classes
+        )
+
+    # Each distinct group is tried once as the held-out set.
+    return sum(1 for group_id in np.unique(group_arr) if _is_valid_holdout(group_id))
+
+
 def accuracy_score(truth: npt.NDArray, predictions: npt.NDArray) -> float:
     """Compute classification accuracy.
 
@@ -261,3 +307,78 @@ def confusion_matrix(truth: npt.NDArray, predictions: npt.NDArray) -> npt.NDArra
         Confusion matrix as a 2D integer array.
     """
     return _confusion_matrix(truth, predictions)
+
+
+def merge_labels(
+    labels_by_behavior: dict[str, npt.NDArray[np.int8]],
+    behavior_names: list[str],
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
+    """Merge per-behavior label arrays into a single multi-class label array.
+
+    Merging rules:
+        - ``TrackLabels.Label.BEHAVIOR`` in the ``MULTICLASS_NONE_BEHAVIOR``
+          entry → class 0 (background).
+        - ``TrackLabels.Label.BEHAVIOR`` in behavior X's entry → class index
+          (1-based, by position in ``behavior_names``).
+        - All other frames → excluded (not in the returned mask).
+
+    Args:
+        labels_by_behavior: dict mapping behavior name to a label array of
+            ``TrackLabels.Label`` integer values, one element per frame.
+        behavior_names: Ordered list of N behavior names (must not include
+            ``MULTICLASS_NONE_BEHAVIOR``).
+
+    Returns:
+        Tuple of ``(multiclass_labels, include_mask)`` where:
+
+        - ``multiclass_labels``: integer array of class indices (0..N) for
+          the included frames only, length M <= n_frames.
+        - ``include_mask``: boolean array of length n_frames; True where the
+          frame is included in training.
+
+    Raises:
+        ValueError: If ``labels_by_behavior`` is empty, an entry is not 1-D
+            (0-D included), entries differ in length, or any frame is labeled
+            ``BEHAVIOR`` for more than one behavior.
+    """
+    if not labels_by_behavior:
+        raise ValueError("labels_by_behavior must not be empty")
+
+    for name, arr in labels_by_behavior.items():
+        if arr.ndim != 1:
+            raise ValueError(f"Label array for '{name}' must be 1-D, got shape {arr.shape}")
+    n_frames = next(iter(labels_by_behavior.values())).shape[0]  # safe: all entries are 1-D
+    for name, arr in labels_by_behavior.items():
+        if arr.shape[0] != n_frames:
+            raise ValueError(
+                f"Label array for '{name}' has length {arr.shape[0]}, expected {n_frames}"
+            )
+
+    all_names = [MULTICLASS_NONE_BEHAVIOR, *behavior_names]
+    behavior_mask = np.zeros(n_frames, dtype=np.intp)  # per-frame count of BEHAVIOR labels
+    for name in all_names:
+        if name in labels_by_behavior:
+            behavior_mask += (labels_by_behavior[name] == TrackLabels.Label.BEHAVIOR).astype(
+                np.intp
+            )
+    conflict_frames = np.where(behavior_mask > 1)[0]
+    if len(conflict_frames) > 0:
+        raise ValueError(
+            f"Conflicting BEHAVIOR labels found on {len(conflict_frames)} frame(s): "
+            f"{conflict_frames.tolist()}. Each frame may be labeled for at "
+            f"most one behavior."
+        )
+
+    class_indices = np.full(n_frames, -1, dtype=np.intp)  # -1 marks an excluded frame
+
+    if MULTICLASS_NONE_BEHAVIOR in labels_by_behavior:
+        none_arr = labels_by_behavior[MULTICLASS_NONE_BEHAVIOR]
+        class_indices[none_arr == TrackLabels.Label.BEHAVIOR] = 0
+
+    for i, behavior in enumerate(behavior_names, start=1):
+        if behavior in labels_by_behavior:
+            beh_arr = labels_by_behavior[behavior]
+            class_indices[beh_arr == TrackLabels.Label.BEHAVIOR] = i
+
+    include_mask = class_indices >= 0
+    return class_indices[include_mask], include_mask
diff --git a/src/jabs/classifier/cross_validation.py b/src/jabs/classifier/cross_validation.py
index 4159e10d..b387af1e 100644
--- a/src/jabs/classifier/cross_validation.py
+++ b/src/jabs/classifier/cross_validation.py
@@ -4,13 +4,14 @@
 from typing import TYPE_CHECKING, NotRequired, TypedDict
 
 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 from sklearn.metrics import precision_recall_fscore_support
 
 from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
 
 from . import classifier_utils
-from .training_report import CrossValidationResult
+from .training_report import BinaryCVResult, CrossValidationResult, MultiClassCVResult
 
 if TYPE_CHECKING:
     from jabs.classifier import Classifier, MultiClassClassifier
@@ -27,6 +28,175 @@ class CVFeatures(TypedDict):
     labels_by_behavior: NotRequired[dict[str, np.ndarray]]
 
 
+def _prepare_cv_labels(
+    classifier: "Classifier | MultiClassClassifier",
+    features: CVFeatures,
+    project: "Project",
+    is_multiclass: bool,
+) -> tuple[npt.NDArray, list[str] | None, dict | None]:
+    """Return the ``(labels, class_names, settings)`` triple that drives CV.
+
+    Binary mode hands back ``features["labels"]`` untouched, with ``None`` for
+    both the class names and the settings. Multi-class mode merges the
+    per-behavior label arrays into one class-index array and pairs it with the
+    class-name list and the settings that every fold is trained with.
+    """
+    if is_multiclass:
+        names = list(getattr(classifier, "behavior_names", []))
+        merged, _ = classifier_utils.merge_labels(features["labels_by_behavior"], names)
+        # Prefer the classifier's own settings; fall back to project defaults.
+        settings = classifier.project_settings or project.get_project_defaults()
+        return merged, [MULTICLASS_NONE_BEHAVIOR, *names], settings
+
+    return features["labels"], None, None
+
+
+def _resolve_k(
+    classifier: "Classifier | MultiClassClassifier",
+    labels: npt.NDArray,
+    groups: npt.NDArray,
+    k: int | float,
+    emit_status: Callable[[str], None],
+) -> int:
+    """Clamp the requested number of CV iterations to what the data allows.
+
+    A return value of 0 tells the caller to skip cross-validation: either
+    ``k`` was not positive or the classifier reported no valid splits.
+    """
+    if k <= 0:
+        return 0
+    max_splits = classifier.get_leave_one_group_out_max(labels, groups)
+    if max_splits == 0:
+        emit_status("No valid cross-validation splits found; skipping CV")
+        return 0
+    if k == np.inf or k > max_splits:
+        if k != np.inf:
+            emit_status(
+                f"Requested {k} cross-validation splits, but only {max_splits} are valid; "
+                f"using {max_splits}"
+            )
+        return max_splits
+    # A finite request within range is used as-is, truncated to an integer.
+    return int(k)
+
+
+def _train_binary_fold(
+    classifier: "Classifier",
+    project: "Project",
+    behavior: str,
+    data: dict,
+) -> None:
+    """Apply project settings and the behavior name, then train on one CV fold's split."""
+    classifier.set_project_settings(project)
+    classifier.behavior_name = behavior
+    classifier.train(data)
+
+
+def _train_multiclass_fold(
+    classifier: "MultiClassClassifier",
+    data: dict,
+    features: CVFeatures,
+    multiclass_settings: dict,
+) -> None:
+    """Fit a multi-class classifier on the rows one CV fold assigns to training.
+
+    The fold's ``training_idx`` selects rows from the full feature frames and
+    from every per-behavior label array before they are handed to ``train``.
+    """
+    rows = data["training_idx"]
+    fold_labels = {}
+    for behavior, label_arr in features["labels_by_behavior"].items():
+        fold_labels[behavior] = label_arr[rows]
+    payload = {"per_frame": features["per_frame"].iloc[rows]}
+    payload["window"] = features["window"].iloc[rows]
+    payload["labels_by_behavior"] = fold_labels
+    payload.update(settings=multiclass_settings, feature_names=data["feature_names"])
+    classifier.train(payload)
+
+
+def _test_label_from_group(test_info: dict) -> str:
+    """Format the held-out group as ``video`` or ``video [identity]`` for the report."""
+    identity = test_info["identity"]
+    video = test_info["video"]
+    return video if identity is None else f"{video} [{identity}]"
+
+
+def _build_binary_cv_result(
+    iteration: int,
+    test_label: str,
+    accuracy: float,
+    confusion: npt.NDArray,
+    top_features: list[tuple[str, float]],
+    data: dict,
+    predictions: npt.NDArray,
+) -> BinaryCVResult:
+    """Bundle one binary CV iteration's scores into a ``BinaryCVResult``."""
+    scores = classifier_utils.precision_recall_score(data["test_labels"], predictions)
+    return BinaryCVResult(
+        iteration=iteration,
+        test_label=test_label,
+        accuracy=accuracy,
+        precision_behavior=float(scores[0][1]),
+        precision_not_behavior=float(scores[0][0]),
+        recall_behavior=float(scores[1][1]),
+        recall_not_behavior=float(scores[1][0]),
+        f1_behavior=float(scores[2][1]),
+        support_behavior=int(scores[3][1]),
+        support_not_behavior=int(scores[3][0]),
+        confusion_matrix=confusion,
+        top_features=top_features,
+    )
+
+
+def _build_multiclass_cv_result(
+    iteration: int,
+    test_label: str,
+    accuracy: float,
+    confusion: npt.NDArray,
+    top_features: list[tuple[str, float]],
+    data: dict,
+    predictions: npt.NDArray,
+    class_names: list[str],
+) -> MultiClassCVResult:
+    """Package one multi-class CV iteration's metrics into a ``MultiClassCVResult``.
+
+    Per-class scores are computed for every index in ``class_names`` so classes
+    absent from the fold still get a row; macro and micro averages are computed
+    without an explicit label list. Undefined ratios are reported as 0.
+    """
+    truth = data["test_labels"]
+    # Per-class scores, one entry per index in class_names.
+    precision, recall, f1, support = precision_recall_fscore_support(
+        truth, predictions, labels=np.arange(len(class_names)), zero_division=0
+    )
+    # Overall averages, keyed by the averaging mode passed to scikit-learn.
+    averaged = {
+        avg: precision_recall_fscore_support(truth, predictions, average=avg, zero_division=0)
+        for avg in ("macro", "micro")
+    }
+    per_class_metrics = []
+    for idx, name in enumerate(class_names):
+        row = {"class_name": name, "precision": float(precision[idx])}
+        row.update(recall=float(recall[idx]), f1=float(f1[idx]), support=int(support[idx]))
+        per_class_metrics.append(row)
+    return MultiClassCVResult(
+        iteration=iteration,
+        test_label=test_label,
+        accuracy=accuracy,
+        confusion_matrix=confusion,
+        top_features=top_features,
+        class_names=class_names,
+        class_support=list(map(int, support)),
+        per_class_metrics=per_class_metrics,
+        precision_macro=float(averaged["macro"][0]),
+        recall_macro=float(averaged["macro"][1]),
+        f1_macro=float(averaged["macro"][2]),
+        precision_micro=float(averaged["micro"][0]),
+        recall_micro=float(averaged["micro"][1]),
+        f1_micro=float(averaged["micro"][2]),
+    )
+
+
 def run_leave_one_group_out_cv(
     classifier: "Classifier | MultiClassClassifier",
     project: "Project",
@@ -38,170 +208,93 @@ def run_leave_one_group_out_cv(
     progress_callback: Callable[[], None] | None = None,
     terminate_callback: Callable[[], None] | None = None,
 ) -> list[CrossValidationResult]:
-    """
-    Run leave-one-group-out cross-validation for a classifier.
+    """Run leave-one-group-out cross-validation for a classifier.
 
     Args:
         classifier: Classifier instance to train.
         project: Project instance containing data and settings.
-        features: Dictionary containing features and labels
-        group_mapping: Mapping of cross validation groups to labeled feature rows.
-        behavior: Behavior label to train on.
-        k: Number of cross-validation splits (int or np.inf for all splits).
+        features: Dictionary containing features and labels.
+        group_mapping: Mapping of cross-validation groups to labeled feature rows.
+        behavior: Behavior label to train on (binary mode only).
+        k: Number of cross-validation splits (int or ``np.inf`` for all splits).
         status_callback: Optional callback for status updates (str argument).
         progress_callback: Optional callback for progress updates (no arguments).
-        terminate_callback: Optional callback to check for early termination (no arguments, should raise if terminate requested).
+        terminate_callback: Optional callback to check for early termination
+            (no arguments, should raise if termination is requested).
 
     Returns:
-        List of CrossValidationResult instances summarizing cross-validation results.
+        List of cross-validation iteration results.
     """
 
-    def emit_status(msg):
+    def emit_status(msg: str) -> None:
         if status_callback:
             status_callback(msg)
 
-    def emit_progress():
+    def emit_progress() -> None:
         if progress_callback:
             progress_callback()
         if terminate_callback:
             terminate_callback()
 
     is_multiclass = "labels_by_behavior" in features
-    labels = features.get("labels")
-    class_names: list[str] | None = None
-    multiclass_settings: dict | None = None
-
-    if is_multiclass:
-        behavior_names = list(getattr(classifier, "behavior_names", []))
-        labels, _ = classifier.merge_labels(features["labels_by_behavior"], behavior_names)
-        class_names = [MULTICLASS_NONE_BEHAVIOR, *behavior_names]
-        multiclass_settings = classifier.project_settings or project.get_project_defaults()
-
-    cv_results = []
-    if k > 0:
-        max_splits = classifier.get_leave_one_group_out_max(labels, features["groups"])
-        if max_splits == 0:
-            emit_status("No valid cross-validation splits found; skipping CV")
-            return cv_results
-        if k == np.inf:
-            k = max_splits
-        elif k > max_splits:
-            emit_status(
-                f"Requested {k} cross-validation splits, but only {max_splits} are valid; using {max_splits}"
-            )
-            k = max_splits
+    labels, class_names, multiclass_settings = _prepare_cv_labels(
+        classifier, features, project, is_multiclass
+    )
+
+    cv_results: list[CrossValidationResult] = []
+    k = _resolve_k(classifier, labels, features["groups"], k, emit_status)
+    if k == 0:
+        return cv_results
 
     emit_status("Generating train/test splits")
     data_generator = classifier.leave_one_group_out(
-        features["per_frame"],
-        features["window"],
-        labels,
-        features["groups"],
+        features["per_frame"], features["window"], labels, features["groups"]
     )
 
-    if k > 0:
-        for i, data in enumerate(data_generator):
-            if terminate_callback:
-                terminate_callback()
-            if i + 1 > k:
-                break
-            emit_status(f"cross validation iteration {i + 1} of {k}")
-            test_info = group_mapping[data["test_group"]]
-            if is_multiclass:
-                if multiclass_settings is None:
-                    raise RuntimeError("Internal error: multiclass settings were not initialized")
-                train_idx = data["training_idx"]
-                labels_by_behavior = {
-                    name: arr[train_idx] for name, arr in features["labels_by_behavior"].items()
-                }
-                classifier.train(
-                    {
-                        "per_frame": features["per_frame"].iloc[train_idx],
-                        "window": features["window"].iloc[train_idx],
-                        "labels_by_behavior": labels_by_behavior,
-                        "settings": multiclass_settings,
-                        "feature_names": data["feature_names"],
-                    }
-                )
-            else:
-                classifier.set_project_settings(project)
-                classifier.behavior_name = behavior
-                classifier.train(data)
-            predictions = classifier.predict(data["test_data"])
-            accuracy = classifier_utils.accuracy_score(data["test_labels"], predictions)
-            confusion = classifier_utils.confusion_matrix(data["test_labels"], predictions)
-            top_features = classifier.get_feature_importance(limit=10)
-            test_label = (
-                f"{test_info['video']} [{test_info['identity']}]"
-                if test_info["identity"] is not None
-                else test_info["video"]
-            )
+    for i, data in enumerate(data_generator):
+        if terminate_callback:
+            terminate_callback()
+        if i + 1 > k:
+            break
+        emit_status(f"cross validation iteration {i + 1} of {k}")
 
-            if is_multiclass and class_names is not None:
-                class_idx = np.arange(len(class_names))
-                precision, recall, f1, support = precision_recall_fscore_support(
-                    data["test_labels"],
-                    predictions,
-                    labels=class_idx,
-                    zero_division=0,
-                )
-                precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
-                    data["test_labels"],
+        if is_multiclass:
+            if multiclass_settings is None:
+                raise RuntimeError("Internal error: multiclass settings were not initialized")
+            _train_multiclass_fold(classifier, data, features, multiclass_settings)
+        else:
+            _train_binary_fold(classifier, project, behavior, data)
+
+        predictions = classifier.predict(data["test_data"])
+        accuracy = classifier_utils.accuracy_score(data["test_labels"], predictions)
+        confusion = classifier_utils.confusion_matrix(data["test_labels"], predictions)
+        top_features = classifier.get_feature_importance(limit=10)
+        test_label = _test_label_from_group(group_mapping[data["test_group"]])
+
+        if is_multiclass and class_names is not None:
+            cv_results.append(
+                _build_multiclass_cv_result(
+                    i + 1,
+                    test_label,
+                    accuracy,
+                    confusion,
+                    top_features,
+                    data,
                     predictions,
-                    average="macro",
-                    zero_division=0,
+                    class_names,
                 )
-                precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
-                    data["test_labels"],
+            )
+        else:
+            cv_results.append(
+                _build_binary_cv_result(
+                    i + 1,
+                    test_label,
+                    accuracy,
+                    confusion,
+                    top_features,
+                    data,
                     predictions,
-                    average="micro",
-                    zero_division=0,
                 )
-                per_class_metrics = [
-                    {
-                        "class_name": name,
-                        "precision": float(precision[idx]),
-                        "recall": float(recall[idx]),
-                        "f1": float(f1[idx]),
-                        "support": int(support[idx]),
-                    }
-                    for idx, name in enumerate(class_names)
-                ]
-                cv_results.append(
-                    CrossValidationResult(
-                        iteration=i + 1,
-                        test_label=test_label,
-                        accuracy=accuracy,
-                        confusion_matrix=confusion,
-                        top_features=top_features,
-                        class_names=class_names,
-                        class_support=[int(x) for x in support],
-                        per_class_metrics=per_class_metrics,
-                        precision_macro=float(precision_macro),
-                        recall_macro=float(recall_macro),
-                        f1_macro=float(f1_macro),
-                        precision_micro=float(precision_micro),
-                        recall_micro=float(recall_micro),
-                        f1_micro=float(f1_micro),
-                    )
-                )
-            else:
-                pr = classifier_utils.precision_recall_score(data["test_labels"], predictions)
-                cv_results.append(
-                    CrossValidationResult(
-                        iteration=i + 1,
-                        test_label=test_label,
-                        accuracy=accuracy,
-                        precision_behavior=float(pr[0][1]),
-                        precision_not_behavior=float(pr[0][0]),
-                        recall_behavior=float(pr[1][1]),
-                        recall_not_behavior=float(pr[1][0]),
-                        f1_behavior=float(pr[2][1]),
-                        support_behavior=int(pr[3][1]),
-                        support_not_behavior=int(pr[3][0]),
-                        confusion_matrix=confusion,
-                        top_features=top_features,
-                    )
-                )
-            emit_progress()
+            )
+        emit_progress()
     return cv_results
diff --git a/src/jabs/classifier/factories.py b/src/jabs/classifier/factories.py
index 5a820f2b..fe7e3a21 100644
--- a/src/jabs/classifier/factories.py
+++ b/src/jabs/classifier/factories.py
@@ -1,18 +1,20 @@
-"""Factory functions for various classifiers.
+"""Factory functions and registry for behavior classifiers.
 
 ``XGBOOST_AVAILABLE`` is set at import time by probing for the ``xgboost``
-package.  Both ``Classifier`` and ``MultiClassClassifier`` import this flag
-to conditionally register XGBoost support, so the availability check and
-warning are emitted exactly once regardless of how many classifier modules
-are imported.
+package. The factory registry below maps each ``ClassifierType`` to its
+constructor for both binary and multi-class modes; mode-specific lookup is
+exposed via :func:`get_factory` and :func:`supported_classifier_types`.
 """
 
 import logging
+import typing
 
 from catboost import CatBoostClassifier
 from sklearn.base import ClassifierMixin
 from sklearn.ensemble import RandomForestClassifier
 
+from jabs.core.enums import ClassifierType
+
 logger = logging.getLogger(__name__)
 
 try:
@@ -27,49 +29,52 @@
     )
 
 
+ClassifierFactory = typing.Callable[[int, int | None], typing.Any]
+
+
 def make_random_forest(n_jobs: int, random_seed: int | None) -> RandomForestClassifier:
-    """Factory function to construct a RandomForest classifier.
+    """Construct a RandomForest classifier.
 
     Args:
-        n_jobs (int): Number of parallel jobs.
-        random_seed (int | None): Random seed for reproducibility.
+        n_jobs: Number of parallel jobs.
+        random_seed: Random seed for reproducibility.
 
     Returns:
-        RandomForestClassifier: An instance of RandomForestClassifier.
+        A configured ``RandomForestClassifier``.
     """
     return RandomForestClassifier(n_jobs=n_jobs, random_state=random_seed)
 
 
 def make_catboost(n_jobs: int, random_seed: int | None) -> CatBoostClassifier:
-    """Factory function to construct a CatBoost classifier.
+    """Construct a CatBoost classifier for binary classification.
 
     Args:
-        n_jobs (int): Number of parallel jobs.
-        random_seed (int | None): Random seed for reproducibility.
+        n_jobs: Number of parallel jobs.
+        random_seed: Random seed for reproducibility.
 
     Returns:
-        CatBoostClassifier: An instance of CatBoostClassifier.
+        A configured ``CatBoostClassifier``.
     """
     return CatBoostClassifier(
         thread_count=n_jobs,
         random_state=random_seed,
-        verbose=False,  # Suppress training output
-        allow_writing_files=False,  # Don't write intermediate files
+        verbose=False,
+        allow_writing_files=False,
     )
 
 
 def make_catboost_multiclass(n_jobs: int, random_seed: int | None) -> CatBoostClassifier:
-    """Factory function to construct a CatBoost classifier for multi-class problems.
+    """Construct a CatBoost classifier for multi-class classification.
 
-    Uses ``loss_function="MultiClass"`` (softmax over all classes), which is
-    required when the label set has more than two classes.
+    Uses ``loss_function="MultiClass"`` (softmax over all classes), required
+    when the label set has more than two classes.
 
     Args:
         n_jobs: Number of parallel jobs.
         random_seed: Random seed for reproducibility.
 
     Returns:
-        CatBoostClassifier configured for multi-class classification.
+        A configured ``CatBoostClassifier`` with multi-class loss.
     """
     return CatBoostClassifier(
         loss_function="MultiClass",
@@ -81,21 +86,21 @@ def make_catboost_multiclass(n_jobs: int, random_seed: int | None) -> CatBoostCl
 
 
 def make_xgboost(n_jobs: int, random_seed: int | None) -> ClassifierMixin:
-    """Factory function to construct an XGBoost classifier.
+    """Construct an XGBoost classifier.
 
-    XGBoost might not be available in all environments (such as macOS without
-    libomp installed), so we try to import here.
+    XGBoost may not be available in all environments (e.g., macOS without
+    libomp), so the import is deferred to call time.
 
     Args:
-        n_jobs (int): Number of parallel jobs.
-        random_seed (int | None): Random seed for reproducibility.
+        n_jobs: Number of parallel jobs.
+        random_seed: Random seed for reproducibility.
 
     Returns:
-        An instance of XGBClassifier. Note: type hint is ClassifierMixin to avoid
-        direct dependency on xgboost in type hints.
+        A configured ``XGBClassifier``. Typed as ``ClassifierMixin`` to avoid a
+        hard dependency on xgboost in type hints.
 
     Raises:
-        RuntimeError: If XGBoost is not available.
+        RuntimeError: If XGBoost is not available in the current environment.
     """
     try:
         import xgboost
@@ -104,3 +109,52 @@ def make_xgboost(n_jobs: int, random_seed: int | None) -> ClassifierMixin:
             "XGBoost classifier requested but 'xgboost' is not available in this environment."
         ) from e
     return xgboost.XGBClassifier(n_jobs=n_jobs, random_state=random_seed)
+
+
+_BINARY_FACTORIES: dict[ClassifierType, ClassifierFactory] = {  # binary-mode registry
+    ClassifierType.RANDOM_FOREST: make_random_forest,
+    ClassifierType.CATBOOST: make_catboost,
+}
+
+_MULTICLASS_FACTORIES: dict[ClassifierType, ClassifierFactory] = {  # multi-class registry
+    ClassifierType.RANDOM_FOREST: make_random_forest,
+    ClassifierType.CATBOOST: make_catboost_multiclass,  # built with loss_function="MultiClass"
+}
+
+if XGBOOST_AVAILABLE:  # register XGBoost in both modes only when the import probe succeeded
+    _BINARY_FACTORIES[ClassifierType.XGBOOST] = make_xgboost
+    _MULTICLASS_FACTORIES[ClassifierType.XGBOOST] = make_xgboost
+
+
+def get_factory(classifier_type: ClassifierType, *, multiclass: bool) -> ClassifierFactory:
+    """Return the registered factory for ``classifier_type`` in the given mode.
+
+    Args:
+        classifier_type: Algorithm whose factory is wanted.
+        multiclass: Select the multi-class registry when True, binary when False.
+
+    Returns:
+        The callable registered for that type in the selected registry.
+
+    Raises:
+        ValueError: If the selected registry has no entry for
+            ``classifier_type``.
+    """
+    registry = _MULTICLASS_FACTORIES if multiclass else _BINARY_FACTORIES
+    factory = registry.get(classifier_type)
+    if factory is None:
+        raise ValueError(f"Unsupported classifier type: {classifier_type!r}")
+    return factory
+
+
+def supported_classifier_types(*, multiclass: bool) -> set[ClassifierType]:
+    """Return the classifier types registered for the requested mode.
+
+    Args:
+        multiclass: Read the multi-class registry when True, binary when False.
+
+    Returns:
+        A new set holding the registry's ``ClassifierType`` keys.
+    """
+    # Build a fresh set so callers never hold a view onto the registry dict.
+    return set(_MULTICLASS_FACTORIES if multiclass else _BINARY_FACTORIES)
diff --git a/src/jabs/classifier/multi_class_classifier.py b/src/jabs/classifier/multi_class_classifier.py
index 0cfe4ba1..72245d1e 100644
--- a/src/jabs/classifier/multi_class_classifier.py
+++ b/src/jabs/classifier/multi_class_classifier.py
@@ -3,44 +3,31 @@
 from __future__ import annotations
 
 import logging
-import typing
 import warnings
 from collections.abc import Generator
 from pathlib import Path
+from typing import ClassVar
 
-import joblib
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from sklearn.exceptions import InconsistentVersionWarning
 
 from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-from jabs.core.enums import ClassifierType
+from jabs.core.enums import (
+    DEFAULT_CV_GROUPING_STRATEGY,
+    ClassifierType,
+    CrossValidationGroupingStrategy,
+)
 from jabs.core.utils import hash_file
-from jabs.project import TrackLabels, load_multiclass_training_data
+from jabs.project import load_multiclass_training_data
 
 from . import classifier_utils
-from .factories import (
-    XGBOOST_AVAILABLE,
-    make_catboost_multiclass,
-    make_random_forest,
-    make_xgboost,
-)
+from .base import BaseClassifier
 
 logger = logging.getLogger(__name__)
 
-_VERSION = 1
-
-_CLASSIFIER_FACTORIES: dict[ClassifierType, typing.Callable[[int, int | None], typing.Any]] = {
-    ClassifierType.RANDOM_FOREST: make_random_forest,
-    ClassifierType.CATBOOST: make_catboost_multiclass,
-}
 
-if XGBOOST_AVAILABLE:
-    _CLASSIFIER_FACTORIES[ClassifierType.XGBOOST] = make_xgboost
-
-
-class MultiClassClassifier:
+class MultiClassClassifier(BaseClassifier):
     """Multi-class behavior classifier for simultaneous classification of N behaviors.
 
     Trains a single classifier over all annotated behaviors, outputting one of N
@@ -59,20 +46,31 @@ class MultiClassClassifier:
         - All other frames → excluded from training
 
     Cross-validation note:
-        Leave-one-group-out CV uses a relaxed split criterion compared to binary
-        mode. Because individual groups (videos or animals) are often labeled for
-        only a subset of behaviors, requiring all classes in every test split would
-        yield no valid splits. Instead, a test split is accepted when it contains at
-        least 2 classes above ``LABEL_THRESHOLD`` and the remaining training groups
-        collectively contain all classes above ``LABEL_THRESHOLD``. A follow-up
-        ticket should introduce a multi-video grouping strategy that aggregates
-        groups to improve class coverage in test splits.
+        Leave-one-group-out CV uses a relaxed split criterion compared to
+        binary mode. Because individual groups (videos or animals) are often
+        labeled for only a subset of behaviors, requiring all classes in every
+        test split would yield no valid splits. Instead, a test split is
+        accepted when it contains at least 2 classes above ``LABEL_THRESHOLD``
+        and the remaining training groups collectively contain all classes
+        above ``LABEL_THRESHOLD``. A follow-up ticket should introduce a
+        multi-video grouping strategy that aggregates groups to improve class
+        coverage in test splits.
 
     Attributes:
         LABEL_THRESHOLD: Minimum number of labeled frames required per class.
     """
 
-    LABEL_THRESHOLD: int = classifier_utils.LABEL_THRESHOLD
+    LABEL_THRESHOLD: ClassVar[int] = classifier_utils.LABEL_THRESHOLD
+
+    _VERSION: ClassVar[int] = 1
+    _MULTICLASS: ClassVar[bool] = True
+    _PERSISTED_REQUIRED: ClassVar[tuple[str, ...]] = (
+        "_classifier",
+        "_behavior_names",
+        "_classifier_type",
+        "_feature_names",
+    )
+    _PERSISTED_OPTIONAL: ClassVar[tuple[str, ...]] = ("_project_settings",)
 
     def __init__(
         self,
@@ -90,9 +88,10 @@ def __init__(
             n_jobs: Number of parallel jobs for training and inference.
 
         Raises:
-            ValueError: If ``behavior_names`` is empty, contains duplicates, or
-                includes the reserved name ``MULTICLASS_NONE_BEHAVIOR``.
-            ValueError: If ``classifier_type`` is not supported in the current environment.
+            ValueError: If ``behavior_names`` is empty, contains duplicates,
+                or includes the reserved name ``MULTICLASS_NONE_BEHAVIOR``.
+            ValueError: If ``classifier_type`` is not supported in the current
+                environment.
         """
         if not behavior_names:
             raise ValueError("behavior_names must not be empty")
@@ -102,73 +101,15 @@ def __init__(
             )
         if len(behavior_names) != len(set(behavior_names)):
             raise ValueError("behavior_names must not contain duplicate entries")
-        if classifier_type not in self._supported_classifier_choices():
-            raise ValueError("Invalid classifier type")
 
+        super().__init__(classifier_type=classifier_type, n_jobs=n_jobs)
         self._behavior_names: list[str] = list(behavior_names)
-        self._classifier_type = classifier_type
-        self._n_jobs = n_jobs
-        self._classifier = None
-        self._feature_names: list[str] | None = None
-        self._project_settings: dict | None = None
-        self._behavior: str | None = None
-        self._version = _VERSION
-        self._classifier_file: str | None = None
-        self._classifier_hash: str | None = None
-        self._classifier_source: str | None = None
-
-    @property
-    def classifier_name(self) -> str:
-        """Return the classifier algorithm name as a string."""
-        return self._classifier_type.value
 
     @property
     def behavior_names(self) -> list[str]:
         """Ordered list of behavior names (does not include ``MULTICLASS_NONE_BEHAVIOR``)."""
         return list(self._behavior_names)
 
-    @property
-    def feature_names(self) -> list[str] | None:
-        """Feature names used when training this classifier."""
-        return self._feature_names
-
-    @property
-    def classifier_type(self) -> ClassifierType:
-        """Underlying classifier algorithm."""
-        return self._classifier_type
-
-    @property
-    def classifier_file(self) -> str | None:
-        """Return the filename of the saved classifier."""
-        return self._classifier_file
-
-    @property
-    def classifier_hash(self) -> str | None:
-        """Return the content hash of the saved classifier."""
-        return self._classifier_hash
-
-    @property
-    def project_settings(self) -> dict:
-        """Return a copy of classifier settings used for training."""
-        if self._project_settings is not None:
-            return dict(self._project_settings)
-        return {}
-
-    @property
-    def behavior_name(self) -> str | None:
-        """Return the selected behavior name, if any."""
-        return self._behavior
-
-    @behavior_name.setter
-    def behavior_name(self, value: str | None) -> None:
-        """Set the selected behavior name."""
-        self._behavior = value
-
-    @property
-    def version(self) -> int:
-        """Return the serialized classifier format version."""
-        return self._version
-
     def get_class_names(self) -> list[str]:
         """Return the ordered list of class names for this classifier.
 
@@ -177,52 +118,10 @@ def get_class_names(self) -> list[str]:
         """
         return [MULTICLASS_NONE_BEHAVIOR, *self._behavior_names]
 
-    @staticmethod
-    def combine_data(per_frame: pd.DataFrame, window: pd.DataFrame) -> pd.DataFrame:
-        """Combine per-frame and window feature matrices into one DataFrame."""
-        return classifier_utils.combine_data(per_frame, window)
-
-    @staticmethod
-    def derive_predictions(probabilities: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
-        """Derive class predictions and confidence from class probabilities."""
-        predictions = np.argmax(probabilities, axis=1).astype(np.int8)
-        confidence = probabilities[np.arange(len(probabilities)), predictions]
-        predictions[confidence == 0] = -1
-        return predictions, confidence
-
-    def set_classifier(self, classifier: ClassifierType) -> None:
-        """Switch the underlying classifier algorithm."""
-        if classifier not in self._supported_classifier_choices():
-            raise ValueError("Invalid Classifier Type")
-        self._classifier_type = classifier
-
     def set_project_settings(self, project) -> None:
         """Copy project defaults as classifier settings."""
         self._project_settings = dict(project.get_project_defaults())
 
-    def set_dict_settings(self, settings: dict) -> None:
-        """Assign classifier settings from a dictionary."""
-        self._project_settings = dict(settings)
-
-    def classifier_choices(self) -> dict[ClassifierType, str]:
-        """Return the available classifier types."""
-        supported = self._supported_classifier_choices()
-        return {t: t.value for t in sorted(supported, key=lambda t: t.value)}
-
-    def get_feature_importance(self, limit: int = 20) -> list[tuple[str, float]]:
-        """Return ranked feature importances, highest first."""
-        if self._classifier is None or self._feature_names is None:
-            return []
-        if not hasattr(self._classifier, "feature_importances_"):
-            return []
-        importances = list(np.asarray(self._classifier.feature_importances_).reshape(-1))
-        feature_importance = [
-            (feature, round(importance, 2))
-            for feature, importance in zip(self._feature_names, importances, strict=True)
-        ]
-        feature_importance.sort(key=lambda x: x[1], reverse=True)
-        return feature_importance[:limit]
-
     def train(self, data: dict, random_seed: int | None = None) -> None:
         """Train the multi-class classifier.
 
@@ -250,10 +149,10 @@ def train(self, data: dict, random_seed: int | None = None) -> None:
 
         settings = data.get("settings", self._project_settings or {})
         # Persist the effective training settings so downstream classification
-        # can consistently reuse the same feature/postprocessing parameters.
+        # consistently reuses the same feature/postprocessing parameters.
         self._project_settings = dict(settings)
 
-        multiclass_labels, include_mask = self.merge_labels(
+        multiclass_labels, include_mask = classifier_utils.merge_labels(
             data["labels_by_behavior"], self._behavior_names
         )
 
@@ -285,7 +184,7 @@ def train(self, data: dict, random_seed: int | None = None) -> None:
             )
 
         clf = self._create_classifier(random_seed=random_seed)
-        cleaned = classifier_utils.clean_features(features, self._classifier_type)
+        cleaned = self._clean_features(features)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=FutureWarning)
             self._classifier = clf.fit(cleaned, multiclass_labels)
@@ -304,16 +203,14 @@ def predict(
 
         Args:
             features: DataFrame of feature data.
-            frame_indexes: Indexes of frames with valid pose data. Frames absent
-                from this array receive a prediction of -1 (no pose).
+            frame_indexes: Indexes of frames with valid pose data. Frames
+                absent from this array receive a prediction of -1 (no pose).
 
         Returns:
             Integer array of shape ``(n_frames,)`` with class indices 0..N,
             or -1 for frames with no pose data.
         """
-        cleaned = self._get_features_to_classify(
-            classifier_utils.clean_features(features, self._classifier_type)
-        )
+        cleaned = self._get_features_to_classify(self._clean_features(features))
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=FutureWarning)
             # ravel() normalizes CatBoost MultiClass, which returns (n, 1).
@@ -335,16 +232,15 @@ def predict_proba(
 
         Args:
             features: DataFrame of feature data.
-            frame_indexes: Indexes of frames with valid pose data. Frames absent
-                from this array receive zero probability across all classes.
+            frame_indexes: Indexes of frames with valid pose data. Frames
+                absent from this array receive zero probability across all
+                classes.
 
         Returns:
             Float array of shape ``(n_frames, N+1)`` where N is the number of
             behaviors. Column 0 is the ``MULTICLASS_NONE_BEHAVIOR`` class.
         """
-        cleaned = self._get_features_to_classify(
-            classifier_utils.clean_features(features, self._classifier_type)
-        )
+        cleaned = self._get_features_to_classify(self._clean_features(features))
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", category=FutureWarning)
             result = self._classifier.predict_proba(cleaned).astype(np.float32)
@@ -356,112 +252,26 @@ def predict_proba(
 
         return result
 
-    def save(self, path: Path) -> None:
-        """Serialize the classifier to disk using joblib.
-
-        Args:
-            path: Destination file path.
-        """
-        joblib.dump(self, path)
-        if self._classifier_file is None:
-            self._classifier_file = Path(path).name
-            self._classifier_hash = hash_file(Path(path))
-            self._classifier_source = "serialized"
+    def save(self, path: Path) -> None:
+        """Delegate serialization to the base class, then log the destination."""
+        super().save(path)
         logger.info("MultiClassClassifier saved to %s", path)
 
-    def load(self, path: Path) -> None:
-        """Deserialize a classifier from disk, updating this instance in place.
-
-        Args:
-            path: Source file path.
-
-        Raises:
-            ValueError: If the file is not a ``MultiClassClassifier``, was saved
-                with a different version, or uses an unsupported classifier type.
-        """
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always", InconsistentVersionWarning)
-            c = joblib.load(path)
-            for warning in caught_warnings:
-                if issubclass(warning.category, InconsistentVersionWarning):
-                    raise ValueError("Classifier trained with different version of sklearn.")
-                else:
-                    warnings.warn(warning.message, warning.category, stacklevel=2)
-
-        if not isinstance(c, MultiClassClassifier):
-            raise ValueError(f"{path} is not an instance of MultiClassClassifier")
-
-        if c._version != _VERSION:
-            raise ValueError(
-                f"Unable to deserialize pickled classifier. "
-                f"File version {c._version}, expected {_VERSION}."
-            )
-
-        if c._classifier_type not in self._supported_classifier_choices():
-            raise ValueError("Invalid classifier type")
-
-        self._classifier = c._classifier
-        self._behavior_names = c._behavior_names
-        self._classifier_type = c._classifier_type
-        self._feature_names = c._feature_names
-        self._project_settings = getattr(c, "_project_settings", None)
-        self._behavior = getattr(c, "_behavior", None)
-        if c._classifier_file is not None:
-            self._classifier_file = c._classifier_file
-            self._classifier_hash = c._classifier_hash
-            self._classifier_source = c._classifier_source
-        else:
-            self._classifier_file = Path(path).name
-            self._classifier_hash = hash_file(Path(path))
-            self._classifier_source = "pickle"
-
+    def load(self, path: Path) -> None:
+        """Delegate deserialization to the base class, then log the source."""
+        super().load(path)
         logger.info("MultiClassClassifier loaded from %s", path)
 
     @classmethod
     def from_pickle(cls, path: Path) -> MultiClassClassifier:
-        """Load a MultiClassClassifier from a pickle file with full validation and metadata backfill.
-
-        Applies the same version, classifier-type, and metadata checks as ``load()``,
-        but as a classmethod factory so no dummy instance is required.
-
-        Args:
-            path: Path to the saved classifier pickle file.
-
-        Returns:
-            Loaded and validated ``MultiClassClassifier`` instance.
+        """Load a MultiClassClassifier from a pickle file and log the source.
 
-        Raises:
-            ValueError: If the file is not a ``MultiClassClassifier``, was saved
-                with a different version, or uses an unsupported classifier type.
+        Thin wrapper over :meth:`BaseClassifier.from_pickle` that adds an
+        informational log entry; all validation lives in the base class.
         """
-        with warnings.catch_warnings(record=True) as caught_warnings:
-            warnings.simplefilter("always", InconsistentVersionWarning)
-            c = joblib.load(path)
-            for warning in caught_warnings:
-                if issubclass(warning.category, InconsistentVersionWarning):
-                    raise ValueError("Classifier trained with different version of sklearn.")
-                else:
-                    warnings.warn(warning.message, warning.category, stacklevel=2)
-
-        if not isinstance(c, cls):
-            raise ValueError(f"{path} is not an instance of MultiClassClassifier")
-
-        if c._version != _VERSION:
-            raise ValueError(
-                f"Unable to deserialize pickled classifier. "
-                f"File version {c._version}, expected {_VERSION}."
-            )
-
-        if c._classifier_type not in cls._supported_classifier_choices():
-            raise ValueError("Invalid classifier type")
-
-        if c._classifier_file is None:
-            c._classifier_file = Path(path).name
-            c._classifier_hash = hash_file(Path(path))
-            c._classifier_source = "pickle"
-
+        classifier = super().from_pickle(path)
         logger.info("MultiClassClassifier loaded from %s", path)
-        return c
+        return classifier
 
     @classmethod
     def from_training_file(
@@ -512,12 +322,12 @@ def leave_one_group_out(
         labels: npt.NDArray,
         groups: npt.NDArray,
     ) -> Generator[dict, None, None]:
-        """Generate leave-one-group-out splits for multi-class cross-validation.
+        """Yield leave-one-group-out splits for multi-class cross-validation.
 
-        Uses a relaxed acceptance criterion: a split is valid when the test group
-        has at least 2 classes above ``LABEL_THRESHOLD`` and the training portion
-        has all classes above ``LABEL_THRESHOLD``. See the class docstring for
-        rationale.
+        Uses a relaxed acceptance criterion: a split is valid when the test
+        group has at least 2 classes above ``LABEL_THRESHOLD`` and the
+        training portion has all classes above ``LABEL_THRESHOLD``. See the
+        class docstring for rationale.
 
         Args:
             per_frame_features: Per-frame feature DataFrame for labeled data.
@@ -546,7 +356,7 @@ def get_leave_one_group_out_max(
         labels: npt.NDArray,
         groups: npt.NDArray,
     ) -> int:
-        """Count the number of valid LOGO splits for multi-class cross-validation.
+        """Count the number of valid LOGO splits for multi-class CV.
 
         A group is counted as a valid test split when it contains at least 2
         distinct classes above ``LABEL_THRESHOLD`` and the remaining training
@@ -559,120 +369,115 @@ def get_leave_one_group_out_max(
         Returns:
             Number of groups that can serve as a valid test split.
         """
-        all_classes = np.unique(labels)
-        unique_groups = np.unique(groups)
-        count = 0
-        for g in unique_groups:
-            test_mask = np.asarray(groups) == g
-            test_labels = np.asarray(labels)[test_mask]
-            train_labels = np.asarray(labels)[~test_mask]
-
-            n_test_classes = sum(
-                np.count_nonzero(test_labels == cls) >= MultiClassClassifier.LABEL_THRESHOLD
-                for cls in all_classes
-            )
-            train_has_all = all(
-                np.count_nonzero(train_labels == cls) >= MultiClassClassifier.LABEL_THRESHOLD
-                for cls in all_classes
-            )
-            if n_test_classes >= 2 and train_has_all:
-                count += 1
-        return count
+        return classifier_utils.count_valid_logo_splits(
+            labels,
+            groups,
+            label_threshold=MultiClassClassifier.LABEL_THRESHOLD,
+            min_test_classes=2,
+        )
 
     @staticmethod
-    def merge_labels(
-        labels_by_behavior: dict[str, npt.NDArray[np.int8]],
+    def count_label_threshold(
+        counts_by_behavior: dict[str, dict],
         behavior_names: list[str],
-    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
-        """Merge per-behavior label arrays into a single multi-class label array.
+        cv_grouping_strategy: CrossValidationGroupingStrategy = DEFAULT_CV_GROUPING_STRATEGY,
+    ) -> int:
+        """Count multi-class LOGO groups that satisfy the relaxed acceptance rule.
 
-        Merging rules:
-            - ``TrackLabels.Label.BEHAVIOR`` in ``MULTICLASS_NONE_BEHAVIOR`` entry
-              → class 0
-            - ``TrackLabels.Label.BEHAVIOR`` in behavior X's entry
-              → class index (1-based, by position in ``behavior_names``)
-            - All other frames → excluded (not in the returned mask)
+        A group counts as a valid test split when at least 2 classes each have
+        ``LABEL_THRESHOLD`` or more frames in it and the remaining (training)
+        groups together still hold ``LABEL_THRESHOLD`` or more frames per class.
 
         Args:
-            labels_by_behavior: dict mapping behavior name to a label array of
-                ``TrackLabels.Label`` integer values, one element per frame.
-            behavior_names: Ordered list of N behavior names (must not include
-                ``MULTICLASS_NONE_BEHAVIOR``).
+            counts_by_behavior: Maps each class name to its labeled-frame count
+                dict (the structure returned by ``Project.counts(name)``),
+                shaped as ``dict[video_name][identity]`` of fragmented and
+                unfragmented frame/bout count tuples.
+            behavior_names: Ordered class names whose counts appear in
+                ``counts_by_behavior``. Typically includes
+                ``MULTICLASS_NONE_BEHAVIOR``.
+            cv_grouping_strategy: Cross-validation grouping strategy.
 
         Returns:
-            Tuple of ``(multiclass_labels, include_mask)`` where:
+            Number of groups that can serve as a valid multi-class LOGO test split.
 
-            - ``multiclass_labels``: integer array of class indices (0..N) for
-              the included frames only, length M ≤ n_frames.
-            - ``include_mask``: boolean array of length n_frames; True where the
-              frame is included in training.
+        Note:
+            Uses "fragmented" label counts since these reflect labels usable
+            for training.
         """
-        if not labels_by_behavior:
-            raise ValueError("labels_by_behavior must not be empty")
-
-        n_frames = next(iter(labels_by_behavior.values())).shape[0]
-
-        for name, arr in labels_by_behavior.items():
-            if arr.ndim != 1:
-                raise ValueError(f"Label array for '{name}' must be 1-D, got shape {arr.shape}")
-            if arr.shape[0] != n_frames:
-                raise ValueError(
-                    f"Label array for '{name}' has length {arr.shape[0]}, expected {n_frames}"
-                )
-
-        # Check for frames labeled BEHAVIOR in more than one behavior.
-        all_names = [MULTICLASS_NONE_BEHAVIOR, *behavior_names]
-        behavior_mask = np.zeros(n_frames, dtype=np.intp)
-        for name in all_names:
-            if name in labels_by_behavior:
-                behavior_mask += (labels_by_behavior[name] == TrackLabels.Label.BEHAVIOR).astype(
-                    np.intp
-                )
-        conflict_frames = np.where(behavior_mask > 1)[0]
-        if len(conflict_frames) > 0:
-            raise ValueError(
-                f"Conflicting BEHAVIOR labels found on {len(conflict_frames)} frame(s): "
-                f"{conflict_frames.tolist()}. Each frame may be labeled for at "
-                f"most one behavior."
+        if not behavior_names:
+            return 0
+
+        threshold = MultiClassClassifier.LABEL_THRESHOLD
+        group_class_counts: dict[tuple[str, int] | str, dict[str, int]] = {}
+        for behavior_name in behavior_names:
+            behavior_counts = counts_by_behavior.get(behavior_name, {})
+            for video_name, video_counts in behavior_counts.items():
+                if cv_grouping_strategy == CrossValidationGroupingStrategy.VIDEO:
+                    key: tuple[str, int] | str = video_name
+                    group_entry = group_class_counts.setdefault(key, {})
+                    group_entry[behavior_name] = group_entry.get(behavior_name, 0) + sum(
+                        identity_counts["fragmented_frame_counts"][0]
+                        for identity_counts in video_counts.values()
+                    )
+                else:
+                    for identity, identity_counts in video_counts.items():
+                        key = (video_name, int(identity))
+                        group_entry = group_class_counts.setdefault(key, {})
+                        group_entry[behavior_name] = identity_counts["fragmented_frame_counts"][0]
+
+        if not group_class_counts:
+            return 0
+
+        total_by_class = {
+            class_name: sum(
+                group_counts.get(class_name, 0) for group_counts in group_class_counts.values()
             )
+            for class_name in behavior_names
+        }
 
-        class_indices = np.full(n_frames, -1, dtype=np.intp)
-
-        # MULTICLASS_NONE_BEHAVIOR BEHAVIOR frames → class 0
-        if MULTICLASS_NONE_BEHAVIOR in labels_by_behavior:
-            none_arr = labels_by_behavior[MULTICLASS_NONE_BEHAVIOR]
-            class_indices[none_arr == TrackLabels.Label.BEHAVIOR] = 0
-
-        # Named behavior BEHAVIOR frames → class 1..N
-        for i, behavior in enumerate(behavior_names, start=1):
-            if behavior in labels_by_behavior:
-                beh_arr = labels_by_behavior[behavior]
-                class_indices[beh_arr == TrackLabels.Label.BEHAVIOR] = i
-
-        include_mask = class_indices >= 0
-        return class_indices[include_mask], include_mask
-
-    def _create_classifier(self, random_seed: int | None = None) -> typing.Any:
-        """Instantiate the underlying sklearn/xgboost/catboost classifier."""
-        try:
-            factory = _CLASSIFIER_FACTORIES[self._classifier_type]
-        except KeyError:
-            raise ValueError(f"Unsupported classifier type: {self._classifier_type!r}") from None
-        return factory(self._n_jobs, random_seed)
-
-    def _get_features_to_classify(self, features: pd.DataFrame) -> pd.DataFrame:
-        """Reorder/select feature columns to match the trained model's expectations."""
-        if self._classifier_type == ClassifierType.XGBOOST:
-            classifier_columns = self._classifier.get_booster().feature_names
-        elif hasattr(self._classifier, "feature_names_in_"):
-            classifier_columns = list(self._classifier.feature_names_in_)
-        elif hasattr(self._classifier, "feature_names_"):
-            classifier_columns = list(self._classifier.feature_names_)
-        else:
-            raise RuntimeError("Error obtaining feature names from classifier.")
-        return features[classifier_columns]
+        valid_groups = 0
+        for group_counts in group_class_counts.values():
+            n_test_classes = sum(
+                group_counts.get(class_name, 0) >= threshold for class_name in behavior_names
+            )
+            train_has_all_classes = all(
+                (total_by_class[class_name] - group_counts.get(class_name, 0)) >= threshold
+                for class_name in behavior_names
+            )
+            if n_test_classes >= 2 and train_has_all_classes:
+                valid_groups += 1
+
+        return valid_groups
 
     @staticmethod
-    def _supported_classifier_choices() -> set[ClassifierType]:
-        """Determine supported classifier types in the current environment."""
-        return set(_CLASSIFIER_FACTORIES.keys())
+    def label_threshold_met(
+        counts_by_behavior: dict[str, dict],
+        behavior_names: list[str],
+        min_groups: int,
+        cv_grouping_strategy: CrossValidationGroupingStrategy = DEFAULT_CV_GROUPING_STRATEGY,
+    ) -> bool:
+        """Determine whether multi-class labels support ``min_groups`` LOGO splits.
+
+        Args:
+            counts_by_behavior: Maps each class name to its labeled-frame count
+                dict (see :meth:`count_label_threshold`).
+            behavior_names: Ordered class names whose counts appear in
+                ``counts_by_behavior``. Returns ``False`` when fewer than two
+                class names are supplied.
+            min_groups: Minimum number of valid LOGO splits required. Floored
+                at 1, since multi-class training requires at least one valid
+                split.
+            cv_grouping_strategy: Cross-validation grouping strategy.
+
+        Returns:
+            True if the count of valid splits meets ``max(1, min_groups)``.
+        """
+        if len(behavior_names) < 2:
+            return False
+        valid_splits = MultiClassClassifier.count_label_threshold(
+            counts_by_behavior=counts_by_behavior,
+            behavior_names=behavior_names,
+            cv_grouping_strategy=cv_grouping_strategy,
+        )
+        return valid_splits >= max(1, min_groups)
diff --git a/src/jabs/classifier/training_report.py b/src/jabs/classifier/training_report.py
index aae312c8..b14dd60f 100644
--- a/src/jabs/classifier/training_report.py
+++ b/src/jabs/classifier/training_report.py
@@ -13,30 +13,17 @@
 
 @dataclass
 class CrossValidationResult:
-    """Results from a single cross-validation iteration.
+    """Common fields shared by binary and multi-class CV iteration results.
 
     Attributes:
-        iteration: The iteration number (1-indexed)
-        test_label: Label of the test grouping (e.g., video filename and possibly identity index)
-        accuracy: Classification accuracy (0.0 to 1.0)
-        precision_behavior: Precision for behavior class (binary mode)
-        precision_not_behavior: Precision for not-behavior class (binary mode)
-        recall_behavior: Recall for behavior class (binary mode)
-        recall_not_behavior: Recall for not-behavior class (binary mode)
-        f1_behavior: F1 score for behavior class (binary mode)
-        support_behavior: Number of behavior frames in test set (binary mode)
-        support_not_behavior: Number of not-behavior frames in test set (binary mode)
-        class_names: Ordered class names for multiclass mode
-        class_support: Per-class support values for multiclass mode
-        precision_macro: Macro precision for multiclass mode
-        recall_macro: Macro recall for multiclass mode
-        f1_macro: Macro F1 for multiclass mode
-        precision_micro: Micro precision for multiclass mode
-        recall_micro: Micro recall for multiclass mode
-        f1_micro: Micro F1 for multiclass mode
-        per_class_metrics: Per-class metric records for multiclass mode
-        confusion_matrix: 2x2 confusion matrix
-        top_features: List of (feature_name, importance) tuples for this iteration
+        iteration: The iteration number (1-indexed).
+        test_label: Label of the test grouping (e.g., video filename and
+            possibly identity index).
+        accuracy: Classification accuracy (0.0 to 1.0).
+        confusion_matrix: Confusion matrix for this iteration. Shape ``(2, 2)``
+            for binary mode, ``(n_classes, n_classes)`` for multi-class.
+        top_features: List of ``(feature_name, importance)`` tuples for this
+            iteration.
     """
 
     iteration: int
@@ -44,22 +31,56 @@ class CrossValidationResult:
     accuracy: float
     confusion_matrix: np.ndarray
     top_features: list[tuple[str, float]] = field(default_factory=list)
-    precision_behavior: float | None = None
-    precision_not_behavior: float | None = None
-    recall_behavior: float | None = None
-    recall_not_behavior: float | None = None
-    f1_behavior: float | None = None
-    support_behavior: int | None = None
-    support_not_behavior: int | None = None
-    class_names: list[str] | None = None
-    class_support: list[int] | None = None
-    precision_macro: float | None = None
-    recall_macro: float | None = None
-    f1_macro: float | None = None
-    precision_micro: float | None = None
-    recall_micro: float | None = None
-    f1_micro: float | None = None
-    per_class_metrics: list[dict[str, float | int | str]] | None = None
+
+
+@dataclass
+class BinaryCVResult(CrossValidationResult):
+    """Binary cross-validation iteration result.
+
+    Attributes:
+        precision_behavior: Precision for the behavior class.
+        precision_not_behavior: Precision for the not-behavior class.
+        recall_behavior: Recall for the behavior class.
+        recall_not_behavior: Recall for the not-behavior class.
+        f1_behavior: F1 score for the behavior class.
+        support_behavior: Number of behavior frames in the test set.
+        support_not_behavior: Number of not-behavior frames in the test set.
+    """
+
+    precision_behavior: float = 0.0
+    precision_not_behavior: float = 0.0
+    recall_behavior: float = 0.0
+    recall_not_behavior: float = 0.0
+    f1_behavior: float = 0.0
+    support_behavior: int = 0
+    support_not_behavior: int = 0
+
+
+@dataclass
+class MultiClassCVResult(CrossValidationResult):
+    """Multi-class cross-validation iteration result.
+
+    Attributes:
+        class_names: Ordered class names (e.g., ``["None", "Walk", "Run"]``).
+        class_support: Per-class support values in the test set.
+        precision_macro: Macro-averaged precision.
+        recall_macro: Macro-averaged recall.
+        f1_macro: Macro-averaged F1 score.
+        precision_micro: Micro-averaged precision.
+        recall_micro: Micro-averaged recall.
+        f1_micro: Micro-averaged F1 score.
+        per_class_metrics: Per-class metric records.
+    """
+
+    class_names: list[str] = field(default_factory=list)
+    class_support: list[int] = field(default_factory=list)
+    precision_macro: float = 0.0
+    recall_macro: float = 0.0
+    f1_macro: float = 0.0
+    precision_micro: float = 0.0
+    recall_micro: float = 0.0
+    f1_micro: float = 0.0
+    per_class_metrics: list[dict[str, float | int | str]] = field(default_factory=list)
 
 
 @dataclass
@@ -67,23 +88,23 @@ class TrainingReportData:
     """Complete training information for generating a report.
 
     Attributes:
-        behavior_name: Name of the behavior being trained
-        classifier_type: Type/name of the classifier (e.g., "Random Forest")
-        window_size: Window size used for feature extraction
-        balance_training_labels: Whether training labels were balanced
-        symmetric_behavior: Whether the behavior is symmetric
-        distance_unit: Unit used for distance features ("cm" or "pixel")
-        cv_results: List of CrossValidationResult objects, one per iteration
-        final_top_features: Top features from final model (trained on all data)
-        frames_behavior: Total number of frames labeled as behavior (binary mode)
-        frames_not_behavior: Total number of frames labeled as not behavior (binary mode)
-        bouts_behavior: Total number of behavior bouts labeled (binary mode)
-        bouts_not_behavior: Total number of not-behavior bouts labeled (binary mode)
-        class_frame_counts: Optional per-class frame counts (multiclass mode)
-        class_bout_counts: Optional per-class bout counts (multiclass mode)
-        training_time_ms: Total training time in milliseconds
-        timestamp: Datetime when training was completed
-        cv_grouping_strategy: Strategy used for cross-validation grouping
+        behavior_name: Name of the behavior being trained.
+        classifier_type: Type/name of the classifier (e.g., "Random Forest").
+        window_size: Window size used for feature extraction.
+        balance_training_labels: Whether training labels were balanced.
+        symmetric_behavior: Whether the behavior is symmetric.
+        distance_unit: Unit used for distance features ("cm" or "pixel").
+        cv_results: List of cross-validation iteration results.
+        final_top_features: Top features from final model (trained on all data).
+        frames_behavior: Total behavior frames (binary mode).
+        frames_not_behavior: Total not-behavior frames (binary mode).
+        bouts_behavior: Total behavior bouts (binary mode).
+        bouts_not_behavior: Total not-behavior bouts (binary mode).
+        class_frame_counts: Per-class frame counts (multi-class mode).
+        class_bout_counts: Per-class bout counts (multi-class mode).
+        training_time_ms: Total training time in milliseconds.
+        timestamp: Datetime when training was completed.
+        cv_grouping_strategy: Strategy used for cross-validation grouping.
     """
 
     behavior_name: str
@@ -106,33 +127,132 @@ class TrainingReportData:
 
 
 def _escape_markdown(text: str) -> str:
-    """Escape markdown special characters in text.
-
-    Args:
-        text: Text that may contain markdown special characters
-
-    Returns:
-        Text with markdown special characters escaped
-    """
-    # Escape common markdown characters that might appear in filenames
-    # Most important: _ (underscore) which creates italics
-    # Also escape: * (asterisk), [ ] (brackets), ( ) (parentheses)
+    """Escape markdown special characters that might appear in identifiers."""
     chars_to_escape = ["_", "*", "[", "]", "(", ")", "`", "#"]
     for char in chars_to_escape:
         text = text.replace(char, f"\\{char}")
     return text
 
 
+def _is_multiclass_cv(cv_results: list[CrossValidationResult]) -> bool:
+    """Return True if the CV result list belongs to multi-class mode."""
+    return bool(cv_results) and isinstance(cv_results[0], MultiClassCVResult)
+
+
+def _format_label_counts(data: TrainingReportData) -> list[str]:
+    """Return markdown lines for the label-counts section."""
+    lines: list[str] = []
+    if data.class_frame_counts is not None:
+        for name, count in data.class_frame_counts.items():
+            lines.append(f"- **{_escape_markdown(name)} frames:** {count:,}")
+        if data.class_bout_counts is not None:
+            for name, count in data.class_bout_counts.items():
+                lines.append(f"- **{_escape_markdown(name)} bouts:** {count:,}")
+    else:
+        lines.append(f"- **Behavior frames:** {data.frames_behavior:,}")
+        lines.append(f"- **Not-behavior frames:** {data.frames_not_behavior:,}")
+        lines.append(f"- **Behavior bouts:** {data.bouts_behavior:,}")
+        lines.append(f"- **Not-behavior bouts:** {data.bouts_not_behavior:,}")
+    return lines
+
+
+def _format_performance_summary(cv_results: list[CrossValidationResult]) -> list[str]:
+    """Return markdown lines summarizing accuracy and F1 across iterations."""
+    accuracies = [r.accuracy for r in cv_results]
+    lines = [f"- **Mean Accuracy:** {np.mean(accuracies):.4f} (± {np.std(accuracies):.4f})"]
+    if _is_multiclass_cv(cv_results):
+        f1_macro = [r.f1_macro for r in cv_results if isinstance(r, MultiClassCVResult)]
+        f1_micro = [r.f1_micro for r in cv_results if isinstance(r, MultiClassCVResult)]
+        if f1_macro:
+            lines.append(
+                f"- **Mean F1 Score (Macro):** {np.mean(f1_macro):.4f} (± {np.std(f1_macro):.4f})"
+            )
+        if f1_micro:
+            lines.append(
+                f"- **Mean F1 Score (Micro):** {np.mean(f1_micro):.4f} (± {np.std(f1_micro):.4f})"
+            )
+    else:
+        f1_behavior = [r.f1_behavior for r in cv_results if isinstance(r, BinaryCVResult)]
+        if f1_behavior:
+            lines.append(
+                f"- **Mean F1 Score (Behavior):** {np.mean(f1_behavior):.4f} "
+                f"(± {np.std(f1_behavior):.4f})"
+            )
+    return lines
+
+
+def _binary_iteration_row(result: BinaryCVResult) -> list[str | int]:
+    """Return a single iteration row for the binary CV table."""
+    return [
+        result.iteration,
+        f"{result.accuracy:.4f}",
+        f"{result.precision_not_behavior:.4f}",
+        f"{result.precision_behavior:.4f}",
+        f"{result.recall_not_behavior:.4f}",
+        f"{result.recall_behavior:.4f}",
+        f"{result.f1_behavior:.4f}",
+        _escape_markdown(result.test_label),
+    ]
+
+
+def _multiclass_iteration_row(result: MultiClassCVResult) -> list[str | int]:
+    """Return a single iteration row for the multi-class CV table."""
+    return [
+        result.iteration,
+        f"{result.accuracy:.4f}",
+        f"{result.precision_macro:.4f}",
+        f"{result.recall_macro:.4f}",
+        f"{result.f1_macro:.4f}",
+        f"{result.f1_micro:.4f}",
+        _escape_markdown(result.test_label),
+    ]
+
+
+_BINARY_HEADERS = [
+    "Iter",
+    "Accuracy",
+    "Precision (Not Behavior)",
+    "Precision (Behavior)",
+    "Recall (Not Behavior)",
+    "Recall (Behavior)",
+    "F1 Score",
+    "Test Group",
+]
+
+_MULTICLASS_HEADERS = [
+    "Iter",
+    "Accuracy",
+    "Precision (Macro)",
+    "Recall (Macro)",
+    "F1 Score (Macro)",
+    "F1 Score (Micro)",
+    "Test Group",
+]
+
+
+def _format_iteration_table(cv_results: list[CrossValidationResult]) -> str:
+    """Return the markdown iteration-details table."""
+    if _is_multiclass_cv(cv_results):
+        rows = [
+            _multiclass_iteration_row(r) for r in cv_results if isinstance(r, MultiClassCVResult)
+        ]
+        headers = _MULTICLASS_HEADERS
+    else:
+        rows = [_binary_iteration_row(r) for r in cv_results if isinstance(r, BinaryCVResult)]
+        headers = _BINARY_HEADERS
+    return tabulate(rows, headers=headers, tablefmt="github")
+
+
 def generate_markdown_report(data: TrainingReportData) -> str:
     """Generate a markdown-formatted training report.
 
     Args:
-        data: TrainingData object containing all training information
+        data: ``TrainingReportData`` object containing all training information.
 
     Returns:
-        Markdown-formatted string
+        Markdown-formatted string.
     """
-    lines = []
+    lines: list[str] = []
 
     lines.append(f"# Training Report: {data.behavior_name}")
     lines.append("")
@@ -154,129 +274,37 @@ def generate_markdown_report(data: TrainingReportData) -> str:
 
     lines.append("### Label Counts")
     lines.append("")
-    if data.class_frame_counts is not None:
-        for name, count in data.class_frame_counts.items():
-            lines.append(f"- **{_escape_markdown(name)} frames:** {count:,}")
-        if data.class_bout_counts is not None:
-            for name, count in data.class_bout_counts.items():
-                lines.append(f"- **{_escape_markdown(name)} bouts:** {count:,}")
-    else:
-        lines.append(f"- **Behavior frames:** {data.frames_behavior:,}")
-        lines.append(f"- **Not-behavior frames:** {data.frames_not_behavior:,}")
-        lines.append(f"- **Behavior bouts:** {data.bouts_behavior:,}")
-        lines.append(f"- **Not-behavior bouts:** {data.bouts_not_behavior:,}")
+    lines.extend(_format_label_counts(data))
     lines.append("")
 
-    # Cross-validation results
     if data.cv_results:
         lines.append("## Cross-Validation Results")
         lines.append("")
-
-        # Summary statistics
-        accuracies = [r.accuracy for r in data.cv_results]
-        is_multiclass_cv = data.cv_results[0].precision_macro is not None
-
         lines.append("### Performance Summary")
         lines.append("")
-        lines.append(
-            f"- **Mean Accuracy:** {np.mean(accuracies):.4f} (± {np.std(accuracies):.4f})"
-        )
-        if is_multiclass_cv:
-            f1_macro = [r.f1_macro for r in data.cv_results if r.f1_macro is not None]
-            f1_micro = [r.f1_micro for r in data.cv_results if r.f1_micro is not None]
-            if f1_macro:
-                lines.append(
-                    f"- **Mean F1 Score (Macro):** {np.mean(f1_macro):.4f} (± {np.std(f1_macro):.4f})"
-                )
-            if f1_micro:
-                lines.append(
-                    f"- **Mean F1 Score (Micro):** {np.mean(f1_micro):.4f} (± {np.std(f1_micro):.4f})"
-                )
-        else:
-            f1_behavior = [r.f1_behavior for r in data.cv_results if r.f1_behavior is not None]
-            if f1_behavior:
-                lines.append(
-                    f"- **Mean F1 Score (Behavior):** {np.mean(f1_behavior):.4f} (± {np.std(f1_behavior):.4f})"
-                )
+        lines.extend(_format_performance_summary(data.cv_results))
         lines.append("")
 
-        # Detailed results table
         lines.append("### Iteration Details")
         lines.append(f"CV Grouping Strategy: {data.cv_grouping_strategy.value}")
         lines.append("")
-
-        table_data = []
-        if is_multiclass_cv:
-            for result in data.cv_results:
-                escaped_video = _escape_markdown(result.test_label)
-                table_data.append(
-                    [
-                        result.iteration,
-                        f"{result.accuracy:.4f}",
-                        f"{(result.precision_macro if result.precision_macro is not None else 0.0):.4f}",
-                        f"{(result.recall_macro if result.recall_macro is not None else 0.0):.4f}",
-                        f"{(result.f1_macro if result.f1_macro is not None else 0.0):.4f}",
-                        f"{(result.f1_micro if result.f1_micro is not None else 0.0):.4f}",
-                        f"{escaped_video}",
-                    ]
-                )
-
-            headers = [
-                "Iter",
-                "Accuracy",
-                "Precision (Macro)",
-                "Recall (Macro)",
-                "F1 Score (Macro)",
-                "F1 Score (Micro)",
-                "Test Group",
-            ]
-        else:
-            for result in data.cv_results:
-                escaped_video = _escape_markdown(result.test_label)
-                table_data.append(
-                    [
-                        result.iteration,
-                        f"{result.accuracy:.4f}",
-                        f"{(result.precision_not_behavior if result.precision_not_behavior is not None else 0.0):.4f}",
-                        f"{(result.precision_behavior if result.precision_behavior is not None else 0.0):.4f}",
-                        f"{(result.recall_not_behavior if result.recall_not_behavior is not None else 0.0):.4f}",
-                        f"{(result.recall_behavior if result.recall_behavior is not None else 0.0):.4f}",
-                        f"{(result.f1_behavior if result.f1_behavior is not None else 0.0):.4f}",
-                        f"{escaped_video}",
-                    ]
-                )
-
-            headers = [
-                "Iter",
-                "Accuracy",
-                "Precision (Not Behavior)",
-                "Precision (Behavior)",
-                "Recall (Not Behavior)",
-                "Recall (Behavior)",
-                "F1 Score",
-                "Test Group",
-            ]
-
-        table_markdown = tabulate(table_data, headers=headers, tablefmt="github")
-        lines.append(table_markdown)
+        lines.append(_format_iteration_table(data.cv_results))
         lines.append("")
     else:
-        # No cross-validation was performed
         lines.append("## Cross-Validation")
         lines.append("")
         lines.append("*No cross-validation was performed for this training.*")
         lines.append("")
 
-    # Final model feature importance
     lines.append("## Feature Importance")
     lines.append("")
     lines.append("Top 20 features from final model (trained on all labeled data):")
     lines.append("")
 
-    feature_table = []
-    for rank, (feature_name, importance) in enumerate(data.final_top_features, start=1):
-        feature_table.append([rank, _escape_markdown(feature_name), f"{importance:.2f}"])
-
+    feature_table = [
+        [rank, _escape_markdown(feature_name), f"{importance:.2f}"]
+        for rank, (feature_name, importance) in enumerate(data.final_top_features, start=1)
+    ]
     feature_markdown = tabulate(
         feature_table, headers=["Rank", "Feature Name", "Importance"], tablefmt="github"
     )
@@ -299,26 +327,80 @@ def _to_python_type(val):
     return val
 
 
+def _common_cv_dict(result: CrossValidationResult) -> dict:
+    """Build the JSON-ready fields shared by every CV result type."""
+    shared: dict = {
+        "iteration": int(result.iteration),
+        "test_label": str(result.test_label),
+        "accuracy": float(result.accuracy),
+        "confusion_matrix": _to_python_type(result.confusion_matrix),
+    }
+    # Each (name, importance) pair becomes a labelled dict; the key is added last.
+    pairs = result.top_features
+    shared["top_features"] = [{"feature_name": str(f), "importance": float(s)} for f, s in pairs]
+    return shared
+
+
+def _binary_cv_to_dict(result: BinaryCVResult) -> dict:
+    """Serialize a binary CV result into a JSON-ready dict.
+
+    The shared fields come first, followed by the binary-only metrics.
+    """
+    return {
+        **_common_cv_dict(result),
+        "precision_behavior": float(result.precision_behavior),
+        "precision_not_behavior": float(result.precision_not_behavior),
+        "recall_behavior": float(result.recall_behavior),
+        "recall_not_behavior": float(result.recall_not_behavior),
+        "f1_behavior": float(result.f1_behavior),
+        "support_behavior": int(result.support_behavior),
+        "support_not_behavior": int(result.support_not_behavior),
+    }
+
+
+def _multiclass_cv_to_dict(result: MultiClassCVResult) -> dict:
+    """Serialize a multi-class CV result into a JSON-ready dict.
+
+    The shared fields come first, followed by the multi-class-only entries.
+    """
+    return {
+        **_common_cv_dict(result),
+        "class_names": [str(label) for label in result.class_names],
+        "class_support": [int(count) for count in result.class_support],
+        "precision_macro": float(result.precision_macro),
+        "recall_macro": float(result.recall_macro),
+        "f1_macro": float(result.f1_macro),
+        "precision_micro": float(result.precision_micro),
+        "recall_micro": float(result.recall_micro),
+        "f1_micro": float(result.f1_micro),
+        "per_class_metrics": _to_python_type(result.per_class_metrics),
+    }
+
+
+def _cv_result_to_dict(result: CrossValidationResult) -> dict:
+    """Serialize a CV result with the converter matching its concrete type."""
+    if isinstance(result, MultiClassCVResult):
+        return _multiclass_cv_to_dict(result)
+    if isinstance(result, BinaryCVResult):
+        return _binary_cv_to_dict(result)
+    return _common_cv_dict(result)  # neither subtype: only the shared fields
+
+
 def generate_json_report(data: TrainingReportData) -> dict:
     """Generate a JSON-serializable training report.
 
     Ensures all numpy types are converted to native Python types.
 
     Args:
-        data: TrainingReportData object containing all training information
+        data: ``TrainingReportData`` object containing all training information.
 
     Returns:
-        Dictionary suitable for JSON serialization
+        Dictionary suitable for JSON serialization.
     """
-    # Convert timestamp to UTC and use ISO format with 'Z' suffix
-    if data.timestamp.tzinfo is None:
-        # Assume naive datetime is local time; convert to UTC
-        timestamp_utc = data.timestamp.astimezone(timezone.utc)
-    else:
-        timestamp_utc = data.timestamp.astimezone(timezone.utc)
+    timestamp_utc = data.timestamp.astimezone(timezone.utc)
     timestamp_str = timestamp_utc.replace(tzinfo=None).isoformat() + "Z"
 
-    report = {
+    return {
         "behavior_name": data.behavior_name,
         "classifier_type": data.classifier_type,
         "window_size": int(data.window_size),
@@ -342,70 +424,26 @@ def generate_json_report(data: TrainingReportData) -> dict:
             if data.class_bout_counts is not None
             else None
         ),
-        "cv_results": [],
+        "cv_results": [_cv_result_to_dict(r) for r in data.cv_results],
         "final_top_features": [
             {"feature_name": str(name), "importance": float(importance)}
             for name, importance in data.final_top_features
         ],
     }
 
-    for result in data.cv_results:
-        result_dict = {
-            "iteration": int(result.iteration),
-            "test_label": str(result.test_label),
-            "accuracy": float(result.accuracy),
-            "confusion_matrix": _to_python_type(result.confusion_matrix),
-            "top_features": [
-                {"feature_name": str(name), "importance": float(importance)}
-                for name, importance in result.top_features
-            ],
-        }
-        if result.precision_behavior is not None:
-            result_dict["precision_behavior"] = float(result.precision_behavior)
-        if result.precision_not_behavior is not None:
-            result_dict["precision_not_behavior"] = float(result.precision_not_behavior)
-        if result.recall_behavior is not None:
-            result_dict["recall_behavior"] = float(result.recall_behavior)
-        if result.recall_not_behavior is not None:
-            result_dict["recall_not_behavior"] = float(result.recall_not_behavior)
-        if result.f1_behavior is not None:
-            result_dict["f1_behavior"] = float(result.f1_behavior)
-        if result.support_behavior is not None:
-            result_dict["support_behavior"] = int(result.support_behavior)
-        if result.support_not_behavior is not None:
-            result_dict["support_not_behavior"] = int(result.support_not_behavior)
-        if result.class_names is not None:
-            result_dict["class_names"] = [str(name) for name in result.class_names]
-        if result.class_support is not None:
-            result_dict["class_support"] = [int(v) for v in result.class_support]
-        if result.precision_macro is not None:
-            result_dict["precision_macro"] = float(result.precision_macro)
-        if result.recall_macro is not None:
-            result_dict["recall_macro"] = float(result.recall_macro)
-        if result.f1_macro is not None:
-            result_dict["f1_macro"] = float(result.f1_macro)
-        if result.precision_micro is not None:
-            result_dict["precision_micro"] = float(result.precision_micro)
-        if result.recall_micro is not None:
-            result_dict["recall_micro"] = float(result.recall_micro)
-        if result.f1_micro is not None:
-            result_dict["f1_micro"] = float(result.f1_micro)
-        if result.per_class_metrics is not None:
-            result_dict["per_class_metrics"] = _to_python_type(result.per_class_metrics)
-        report["cv_results"].append(result_dict)
-
-    return report
-
 
 def save_training_report(data: TrainingReportData, output_path: Path) -> None:
     """Generate and save a training report.
 
-    The report format is determined by the file extension of output_path.
-    Currently, markdown (.md) and JSON (.json) are supported.
+    The report format is determined by the file extension of ``output_path``.
+    Currently, markdown (``.md``) and JSON (``.json``) are supported.
 
     Args:
-        data: TrainingData object containing all training information
-        output_path: Path where the report file should be saved
+        data: ``TrainingReportData`` object containing all training information.
+        output_path: Path where the report file should be saved.
+
+    Raises:
+        ValueError: If the output path has an unsupported extension.
     """
     if output_path.suffix.lower() == ".md":
         markdown_content = generate_markdown_report(data)
diff --git a/src/jabs/project/parallel_workers.py b/src/jabs/project/parallel_workers.py
index 69db641b..d54ae2c4 100644
--- a/src/jabs/project/parallel_workers.py
+++ b/src/jabs/project/parallel_workers.py
@@ -18,7 +18,7 @@
 
 import jabs.feature_extraction as fe
 from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-from jabs.core.enums import CacheFormat, ClassifierMode
+from jabs.core.enums import CacheFormat
 from jabs.pose_estimation import open_pose_file
 from jabs.video_reader import VideoReader
 from jabs.video_reader.utilities import get_fps
@@ -32,12 +32,8 @@
     from jabs.pose_estimation import PoseEstimation
 
 
-class FeatureLoadJobSpec(TypedDict):
-    """Specification of a single video feature extraction job.
-
-    This TypedDict encapsulates all necessary information for loading
-    a single video's features *for labeled frames* in a parallel worker.
-    """
+class _BaseFeatureLoadJobSpec(TypedDict):
+    """Shared fields for binary and multi-class feature-load jobs."""
 
     video: str
     video_path: Path
@@ -46,19 +42,36 @@ class FeatureLoadJobSpec(TypedDict):
     feature_dir: Path
     cache_dir: Path | None
     behavior_settings: dict[str, object]
-    behavior_name: str | None
-    behavior_names: list[str] | None
-    classifier_mode: ClassifierMode
     cache_format: str
 
 
-class CollectFeatureLoadResult(TypedDict):
-    """Result of collecting features from a single video."""
+class BinaryFeatureLoadJobSpec(_BaseFeatureLoadJobSpec):
+    """Single-video job spec read by ``collect_binary_labeled_features`` (binary mode)."""
+
+    behavior_name: str | None  # track passed to get_track_labels() for every identity
+
+
+class MulticlassFeatureLoadJobSpec(_BaseFeatureLoadJobSpec):
+    """Single-video job spec read by ``collect_multiclass_labeled_features``."""
+
+    behavior_names: list[str]  # class tracks; an empty list makes the worker raise ValueError
+
+
+class BinaryFeatureResult(TypedDict):
+    """Binary-mode features for one video; each list has one entry per kept identity."""
 
     per_frame: list[pd.DataFrame]
     window: list[pd.DataFrame]
     labels: list[np.ndarray]
-    labels_by_behavior: list[dict[str, np.ndarray]] | None
+    group_keys: list[tuple[str, int]]  # (video, identity) for each list entry
+
+
+class MulticlassFeatureResult(TypedDict):
+    """Multi-class features for one video; each list has one entry per kept identity."""
+
+    per_frame: list[pd.DataFrame]
+    window: list[pd.DataFrame]
+    labels_by_behavior: list[dict[str, np.ndarray]]  # track name -> labels of kept frames
     group_keys: list[tuple[str, int]]
 
 
@@ -125,8 +138,8 @@ def scan_video_metadata(job: VideoScanJobSpec) -> VideoScanResult:
 
     Reads everything :class:`~jabs.project.video_manager.VideoManager` and
     :class:`~jabs.project.feature_manager.FeatureManager` need at project-load
-    time — frame counts, identity count, static objects, lixit keypoints, and
-    distance-unit attributes — in a single ``h5py.File`` open.
+    time - frame counts, identity count, static objects, lixit keypoints, and
+    distance-unit attributes - in a single ``h5py.File`` open.
 
     This function is stateless and picklable so it can be dispatched to
     :class:`~jabs.core.utils.process_pool_manager.ProcessPoolManager` workers.
@@ -186,145 +199,209 @@ def _load_video_labels(annotations_path: Path, pose_est: "PoseEstimation") -> Vi
     return VideoLabels.load(data, pose_est)
 
 
-def collect_labeled_features(job: FeatureLoadJobSpec) -> CollectFeatureLoadResult:
-    """Extracts features for labeled frames for a single video.
+def _apply_macos_fork_lapack_workaround() -> None:
+    """Use the numpy detrend path in child processes on macOS.
 
-    This function loads per-frame and window features for a given video. If features
-    are not pre-computed then this will result in features being computed directly
-    from pose. It is intended to be used in parallel in Project.get_labeled_features().
-    Returns features for labeled frame only, features for unlabeled frames are discarded.
+    Apple's Accelerate LAPACK backs scipy.linalg.lstsq (called by signal.stft's
+    "linear" detrend) and segfaults when invoked from a forked child process.
+    The flag is process-local, so the main process and non-macOS workers are
+    unaffected.
+    """
+    if sys.platform != "darwin" or multiprocessing.parent_process() is None:
+        return
+    fe.feature_base_class._use_numpy_detrend = True
 
-    Note: this function is a standalone function to facilitate pickling for parallel
-    processing via ProcessPoolExecutor. It should not rely on any instance-specific
-    state, and is passed all necessary data via the JobSpec argument. A Project instance
-    maintains a pool of workers that call this function in parallel from
-    Project.load_labeled_features() in order to speed up feature extraction across
-    multiple videos.
+
+def _open_pose_and_labels(
+    job: _BaseFeatureLoadJobSpec,
+) -> "tuple[PoseEstimation, VideoLabels | None, float]":
+    """Apply the macOS workaround, then open the job's pose, labels, and fps."""
+    _apply_macos_fork_lapack_workaround()
+    pose = open_pose_file(job["pose_path"], job["cache_dir"])
+    frame_rate = get_fps(str(job["video_path"]))
+    # The labels entry may be None (see the return annotation).
+    return pose, _load_video_labels(job["annotations_path"], pose), frame_rate
+
+
+def _extract_identity_features(
+    video: str,
+    identity: int,
+    pose_est: "PoseEstimation",
+    feature_dir: Path,
+    behavior_settings: dict,
+    cache_format: str,
+    fps: float,
+    labels: np.ndarray,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Compute per-frame and window features for one identity's labeled frames.
 
     Args:
-        job (FeatureLoadJobSpec): Specification of the video and settings for feature extraction.
+        video: Video filename (used to locate cached features).
+        identity: Identity index within the video.
+        pose_est: Open pose estimation object.
+        feature_dir: Project feature directory.
+        behavior_settings: Behavior-scoped settings dict (must include ``window_size``).
+        cache_format: Cache format string from project settings.
+        fps: Video frames per second.
+        labels: Per-frame label vector for this identity; rows with
+            ``TrackLabels.Label.NONE`` are dropped from the output features.
 
     Returns:
-        CollectFeatureLoadResult: Collected per-frame and window features, labels, and
-            identity mapping for the video.
+        Tuple ``(per_frame_df, window_df)`` containing only the labeled rows.
     """
-    video: str = job["video"]
-    video_path = job["video_path"]
-    pose_path = job["pose_path"]
-    annotations_path = job["annotations_path"]
-    feature_dir = job["feature_dir"]
-    cache_dir = job["cache_dir"]
-    behavior_settings: dict = job["behavior_settings"]
-    behavior_name = job["behavior_name"]
-    behavior_names = job["behavior_names"]
-    classifier_mode = job["classifier_mode"]
+    identity_features = fe.IdentityFeatures(
+        video,
+        identity,
+        feature_dir,
+        pose_est,
+        fps=fps,
+        op_settings=behavior_settings,
+        cache_format=CacheFormat(cache_format),
+    )
+    per_frame_flat = identity_features.get_per_frame_flat(labels)
+
+    # Window features use the behavior's window size and are merged before framing.
+    windowed = identity_features.get_window_features(behavior_settings["window_size"], labels)
+    merged_window = fe.IdentityFeatures.merge_window_features(windowed)
+    return pd.DataFrame(per_frame_flat), pd.DataFrame(merged_window)
 
-    # On macOS, scipy.linalg.lstsq (called by signal.stft's "linear" detrend)
-    # uses Apple's Accelerate LAPACK, which segfaults when invoked from a
-    # forked child process.  Switch to the pure-numpy detrend path only on
-    # macOS to avoid this.  This flag is process-local so the main process
-    # and non-macOS workers are unaffected.
-    if sys.platform == "darwin" and multiprocessing.parent_process() is not None:
-        fe.feature_base_class._use_numpy_detrend = True
 
-    pose_est = open_pose_file(pose_path, cache_dir)
-    fps = get_fps(str(video_path))
+def collect_binary_labeled_features(job: BinaryFeatureLoadJobSpec) -> BinaryFeatureResult:
+    """Extract per-frame and window features for one video, binary mode.
+
+    For every identity in the video, label arrays are loaded from the requested
+    behavior track. Frames where the identity does not exist are forced to
+    ``TrackLabels.Label.NONE``, and only ``NONE`` frames are dropped before
+    feature extraction - both ``BEHAVIOR`` and ``NOT_BEHAVIOR`` frames for the
+    requested behavior are retained as training rows. Identities with no
+    labeled frames after that filter are skipped entirely.
+
+    This function is stateless and picklable so it can be dispatched to
+    process-pool workers from :class:`~jabs.project.Project`.
 
-    # Get labels for video (might be None)
-    # this loads all labels from the annotations file for any labeled behavior
-    labels_obj = _load_video_labels(annotations_path, pose_est)
+    Args:
+        job: Video and feature-extraction settings.
+
+    Returns:
+        Collected per-frame/window features, label arrays, and per-identity
+        group keys for the video.
+    """
+    pose_est, labels_obj, fps = _open_pose_and_labels(job)
     if labels_obj is None:
-        return {
-            "per_frame": [],
-            "window": [],
-            "labels": [],
-            "labels_by_behavior": None,
-            "group_keys": [],
-        }
+        return {"per_frame": [], "window": [], "labels": [], "group_keys": []}
+
+    video = job["video"]
+    behavior_name = job["behavior_name"]
+    behavior_settings: dict = job["behavior_settings"]
+    feature_dir = job["feature_dir"]
+    cache_format = job["cache_format"]
 
     per_frame_list: list[pd.DataFrame] = []
     window_list: list[pd.DataFrame] = []
     labels_list: list[np.ndarray] = []
-    labels_by_behavior_list: list[dict[str, np.ndarray]] = []
     group_keys: list[tuple[str, int]] = []
 
     for identity in pose_est.identities:
         identity_mask = pose_est.identity_mask(identity).astype(bool)
-        labels = None
-        labels_by_behavior = None
-
-        if classifier_mode == ClassifierMode.MULTICLASS:
-            if behavior_names is None:
-                raise ValueError("behavior_names is required for multiclass feature collection")
-
-            behavior_tracks = [MULTICLASS_NONE_BEHAVIOR, *behavior_names]
-            labels_by_behavior = {}
-            include_mask = np.zeros(identity_mask.shape, dtype=bool)
-
-            for behavior_key in behavior_tracks:
-                behavior_labels = (
-                    labels_obj.get_track_labels(str(identity), behavior_key).get_labels().copy()
-                )
-                behavior_labels[~identity_mask] = TrackLabels.Label.NONE
-                labels_by_behavior[behavior_key] = behavior_labels
-                include_mask |= behavior_labels == TrackLabels.Label.BEHAVIOR
-
-            # Include only frames with explicit BEHAVIOR labels in any class.
-            labels = np.full(identity_mask.shape, TrackLabels.Label.NONE, dtype=np.int8)
-            labels[include_mask] = TrackLabels.Label.BEHAVIOR
-        else:
-            # Extract labels for this (video, identity) pair for the specified behavior
-            labels = labels_obj.get_track_labels(str(identity), behavior_name).get_labels()
-
-            # Exclude frames where identity does not exist
-            # NOTE: in the future we might want to handle this differently, since we can still predict
-            # behavior even when the identity is not detected in a frame (e.g., occluded) due to
-            # temporal context from surrounding frames provided by window features
-            labels[~identity_mask] = TrackLabels.Label.NONE
-
-        # Skip identities without any included labels
+        labels = labels_obj.get_track_labels(str(identity), behavior_name).get_labels().copy()
+        # Copy first (as the multi-class worker does) so masking cannot alter labels_obj.
+        # Frames where the identity does not exist are excluded. NOTE: in the future
+        # we might want to handle this differently, since we can still predict behavior
+        # when the identity is not detected (e.g., occluded) thanks to window features.
+        labels[~identity_mask] = TrackLabels.Label.NONE
+
         if (labels != TrackLabels.Label.NONE).sum() == 0:
             continue
 
-        # Feature extraction for this identity
-        cache_format = CacheFormat(job["cache_format"])
-        features = fe.IdentityFeatures(
-            video,
-            identity,
-            feature_dir,
-            pose_est,
-            fps=fps,
-            op_settings=behavior_settings,
-            cache_format=cache_format,
+        per_frame_df, window_df = _extract_identity_features(
+            video, identity, pose_est, feature_dir, behavior_settings, cache_format, fps, labels
         )
+        per_frame_list.append(per_frame_df)
+        window_list.append(window_df)
+        labels_list.append(labels[labels != TrackLabels.Label.NONE])
+        group_keys.append((video, int(identity)))
 
-        # Per-frame features
-        per_frame = features.get_per_frame_flat(labels)
+    return {
+        "per_frame": per_frame_list,
+        "window": window_list,
+        "labels": labels_list,
+        "group_keys": group_keys,
+    }
 
-        # Window features
-        window_size: int = behavior_settings["window_size"]
-        window_features = features.get_window_features(window_size, labels)
-        window_features = fe.IdentityFeatures.merge_window_features(window_features)
 
-        # Keep only labeled frames
-        per_frame_list.append(pd.DataFrame(per_frame))
-        window_list.append(pd.DataFrame(window_features))
-        labels_list.append(labels[labels != TrackLabels.Label.NONE])
-        if labels_by_behavior is not None:
-            labels_by_behavior_list.append(
-                {
-                    key: arr[labels != TrackLabels.Label.NONE]
-                    for key, arr in labels_by_behavior.items()
-                }
+def collect_multiclass_labeled_features(
+    job: MulticlassFeatureLoadJobSpec,
+) -> MulticlassFeatureResult:
+    """Extract per-frame and window features for one video, multi-class mode.
+
+    Only frames with an explicit ``TrackLabels.Label.BEHAVIOR`` label in at least
+    one class track (including the reserved ``MULTICLASS_NONE_BEHAVIOR`` track)
+    are kept. Identities with no such frames are skipped entirely.
+
+    This function is stateless and picklable so it can be dispatched to
+    process-pool workers from :class:`~jabs.project.Project`.
+
+    Args:
+        job: Video and feature-extraction settings (must include ``behavior_names``).
+
+    Returns:
+        Per-frame/window features, per-behavior labels, and group keys for the video.
+
+    Raises:
+        ValueError: If ``behavior_names`` is empty (checked once labels are loaded).
+    """
+    pose_est, labels_obj, fps = _open_pose_and_labels(job)
+    if labels_obj is None:
+        return {"per_frame": [], "window": [], "labels_by_behavior": [], "group_keys": []}
+
+    behavior_names = job["behavior_names"]
+    if not behavior_names:
+        raise ValueError("behavior_names is required for multiclass feature collection")
+
+    video = job["video"]
+    behavior_settings: dict = job["behavior_settings"]
+    feature_dir = job["feature_dir"]
+    cache_format = job["cache_format"]
+    behavior_tracks = [MULTICLASS_NONE_BEHAVIOR, *behavior_names]
+
+    per_frame_list: list[pd.DataFrame] = []
+    window_list: list[pd.DataFrame] = []
+    labels_by_behavior_list: list[dict[str, np.ndarray]] = []
+    group_keys: list[tuple[str, int]] = []
+
+    for identity in pose_est.identities:
+        identity_mask = pose_est.identity_mask(identity).astype(bool)
+
+        labels_by_behavior: dict[str, np.ndarray] = {}
+        include_mask = np.zeros(identity_mask.shape, dtype=bool)
+        for behavior_key in behavior_tracks:
+            behavior_labels = (
+                labels_obj.get_track_labels(str(identity), behavior_key).get_labels().copy()
             )
+            behavior_labels[~identity_mask] = TrackLabels.Label.NONE
+            labels_by_behavior[behavior_key] = behavior_labels
+            include_mask |= behavior_labels == TrackLabels.Label.BEHAVIOR
+
+        # Include only frames with explicit BEHAVIOR labels in any class.
+        labels = np.full(identity_mask.shape, TrackLabels.Label.NONE, dtype=np.int8)
+        labels[include_mask] = TrackLabels.Label.BEHAVIOR
+        # include_mask is exactly the set of non-NONE entries in ``labels``; reuse it.
+        if not include_mask.any():
+            continue
+
+        per_frame_df, window_df = _extract_identity_features(
+            video, identity, pose_est, feature_dir, behavior_settings, cache_format, fps, labels
+        )
+        per_frame_list.append(per_frame_df)
+        window_list.append(window_df)
+        labels_by_behavior_list.append(
+            {key: arr[include_mask] for key, arr in labels_by_behavior.items()}
+        )
         group_keys.append((video, int(identity)))
 
     return {
         "per_frame": per_frame_list,
         "window": window_list,
-        "labels": labels_list,
-        "labels_by_behavior": (
-            labels_by_behavior_list if classifier_mode == ClassifierMode.MULTICLASS else None
-        ),
+        "labels_by_behavior": labels_by_behavior_list,
         "group_keys": group_keys,
     }
diff --git a/src/jabs/project/prediction_manager.py b/src/jabs/project/prediction_manager.py
index 595bcb39..abfef0cc 100644
--- a/src/jabs/project/prediction_manager.py
+++ b/src/jabs/project/prediction_manager.py
@@ -12,7 +12,7 @@
 MULTICLASS_PREDICTION_KEY = "__multiclass__"
 
 if typing.TYPE_CHECKING:
-    from jabs.classifier import Classifier
+    from jabs.classifier import Classifier, MultiClassClassifier
     from jabs.pose_estimation import PoseEstimation
 
     from .project import Project
@@ -32,9 +32,6 @@ class PredictionManager:
 
     """
 
-    _PREDICTION_FILE_VERSION = 2
-    MULTICLASS_PREDICTION_KEY = MULTICLASS_PREDICTION_KEY
-
     def __init__(self, project: "Project"):
         """Initialize the PredictionManager with a project.
 
@@ -51,7 +48,7 @@ def write_predictions(
         predictions: np.ndarray,
         probabilities: np.ndarray,
         poses: "PoseEstimation",
-        classifier: "Classifier",
+        classifier: "Classifier | MultiClassClassifier",
         postprocessed_predictions: np.ndarray | None = None,
         class_names: list[str] | None = None,
     ) -> None:
@@ -69,7 +66,7 @@ def write_predictions(
                 (n_animals, n_frames); multi-class predictions use shape
                 (n_animals, n_frames, n_classes).
             poses: PoseEstimation object corresponding to the video.
-            classifier: Classifier object used to generate predictions.
+            classifier: Binary or multi-class classifier instance used to generate predictions.
             postprocessed_predictions (np.ndarray | None): Optional array of post-processed predictions.
             class_names (list[str] | None): Optional ordered class names for multi-class predictions.
 
@@ -131,7 +128,7 @@ def load_multiclass_predictions(
             names. Missing predictions return empty dicts and ``None`` for
             class names.
         """
-        return self._load_prediction_record(video, self.MULTICLASS_PREDICTION_KEY)
+        return self._load_prediction_record(video, MULTICLASS_PREDICTION_KEY)
 
     def _load_prediction_record(
         self, video: str, behavior: str
diff --git a/src/jabs/project/project.py b/src/jabs/project/project.py
index 766a5396..c7230b94 100644
--- a/src/jabs/project/project.py
+++ b/src/jabs/project/project.py
@@ -33,10 +33,12 @@
 
 from .feature_manager import FeatureManager
 from .parallel_workers import (
-    FeatureLoadJobSpec,
+    BinaryFeatureLoadJobSpec,
+    MulticlassFeatureLoadJobSpec,
     VideoScanJobSpec,
     VideoScanResult,
-    collect_labeled_features,
+    collect_binary_labeled_features,
+    collect_multiclass_labeled_features,
     scan_video_metadata,
 )
 from .prediction_manager import PredictionManager
@@ -669,7 +671,7 @@ def get_overlapping_behavior_label_videos(self) -> list[str]:
         Scans every video in the project for annotation conflicts where a single
         identity has the same frame labeled BEHAVIOR for two or more behaviors
         simultaneously. Includes the reserved "None" behavior track, consistent
-        with how MultiClassClassifier.merge_labels() detects conflicts at training time.
+        with how classifier_utils.merge_labels() detects conflicts at training time.
 
         Returns:
             Sorted list of video filenames containing at least one overlap.
@@ -796,6 +798,132 @@ def counts(self, behavior):
             counts[video] = self.load_counts(video, behavior)
         return counts
 
+    def _build_feature_load_job_base(self, video: str, behavior_settings: dict) -> dict:
+        """Return the per-video fields shared by every feature-load job spec, as a new dict."""
+        return {
+            "video": video,  # video name as given by the caller
+            "video_path": self._video_manager.video_path(video),
+            "pose_path": self._video_manager.get_cached_pose_path(video),
+            "annotations_path": self._paths.annotations_dir / Path(video).with_suffix(".json"),
+            "feature_dir": self.feature_dir,
+            "cache_dir": self._paths.cache_dir,
+            "behavior_settings": behavior_settings,  # passed through unchanged
+            "cache_format": self.cache_format.value,  # the .value, not the object itself
+        }
+
+    def _collect_features_parallel(
+        self,
+        jobs: list[dict],
+        worker_fn: Callable[[dict], dict],
+        progress_callable: Callable[[], None] | None,
+        should_terminate_callable: Callable[[], None] | None,
+    ) -> dict[str, dict]:
+        """Run feature-collection jobs in parallel (or single-threaded fallback).
+
+        Args:
+            jobs: One job spec per video.
+            worker_fn: Worker function to apply to each job spec.
+            progress_callable: Called once per completed video, if provided.
+            should_terminate_callable: Called before each result is collected (pool) or each
+                job is run (serial), if provided; should raise on user-requested cancellation.
+
+        Returns:
+            Dict keyed by video name. Callers reorder by their canonical
+            ``videos`` list for deterministic concatenation.
+        """
+        executor = self._process_pool
+        results_by_video: dict[str, dict] = {}
+
+        if executor is not None:
+            future_to_video = {executor.submit(worker_fn, job): job["video"] for job in jobs}
+            for future in as_completed(future_to_video):  # yields in completion order
+                if should_terminate_callable:
+                    should_terminate_callable()  # may raise; queued futures are left uncancelled
+                video_name = future_to_video[future]
+                try:
+                    res = future.result()
+                except Exception as e:
+                    raise RuntimeError(f"Feature collection failed for video: {video_name}") from e
+                results_by_video[video_name] = res
+                if progress_callable:
+                    progress_callable()
+        else:  # no process pool: run the jobs one at a time in this process
+            for job in jobs:
+                if should_terminate_callable:
+                    should_terminate_callable()
+                try:
+                    res = worker_fn(job)
+                except Exception as e:
+                    raise RuntimeError(
+                        f"Feature collection failed for video: {job['video']}"
+                    ) from e
+                results_by_video[job["video"]] = res
+                if progress_callable:
+                    progress_callable()
+
+        return results_by_video
+
+    @staticmethod
+    def _assign_cv_group_ids(
+        all_group_keys: list[tuple[str, int]],
+        videos: list[str],
+        grouping_strategy: CrossValidationGroupingStrategy,
+    ) -> tuple[dict[tuple[str, int], int], dict[int, dict]]:
+        """Assign deterministic cross-validation group ids.
+
+        Args:
+            all_group_keys: ``(video, identity)`` tuples in row order.
+            videos: Canonical list of project videos; ids are assigned in this order.
+            grouping_strategy: ``INDIVIDUAL`` groups one (video, identity) pair per
+                gid; ``VIDEO`` groups all identities of a video together.
+
+        Returns:
+            Tuple of ``(key_to_gid, group_mapping)`` where ``key_to_gid`` maps each
+            ``(video, identity)`` pair to its group id and ``group_mapping`` maps
+            each group id back to ``{"video": ..., "identity": ...}``.
+        """
+        key_to_gid: dict[tuple[str, int], int] = {}
+        group_mapping: dict[int, dict] = {}
+        gid = 0  # next unused group id
+        if grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
+            for v in videos:
+                seen: list[int] = []  # identities of v, in first-seen order
+                for video_name, ident in all_group_keys:
+                    if video_name == v and ident not in seen:
+                        seen.append(ident)
+                for ident in seen:
+                    key = (v, ident)
+                    if key not in key_to_gid:  # only already present if v repeats in videos
+                        key_to_gid[key] = gid
+                        group_mapping[gid] = {"video": v, "identity": ident}
+                        gid += 1
+        elif grouping_strategy == CrossValidationGroupingStrategy.VIDEO:
+            video_to_gid: dict[str, int] = {}
+            for v in videos:  # a gid is assigned even if v has no entry in all_group_keys
+                if v not in video_to_gid:
+                    video_to_gid[v] = gid
+                    group_mapping[gid] = {"video": v, "identity": None}  # whole-video group
+                    gid += 1
+                for video_name, ident in all_group_keys:
+                    if video_name == v:
+                        key_to_gid[(v, ident)] = video_to_gid[v]
+        else:
+            raise ValueError(f"Unknown grouping strategy: {grouping_strategy}")
+        return key_to_gid, group_mapping
+
+    @staticmethod
+    def _build_groups_array(
+        all_group_keys: list[tuple[str, int]],
+        all_per_frame: list[pd.DataFrame],
+        key_to_gid: dict[tuple[str, int], int],
+    ) -> np.ndarray:
+        """Build the per-row group id array aligned to concatenated feature matrices."""
+        groups_list: list[np.ndarray] = [
+            np.full(df.shape[0], key_to_gid[key], dtype=np.int32)  # one group id per row of df
+            for key, df in zip(all_group_keys, all_per_frame, strict=True)  # lengths must match
+        ]
+        return np.concatenate(groups_list) if groups_list else np.array([], dtype=np.int32)
+
     def get_labeled_features(
         self,
         behavior: str | None = None,
@@ -836,22 +964,11 @@ def get_labeled_features(
                 The second dict maps group ids to their source:
                     { <group id>: {'video': <video filename>, 'identity': <identity>}, ... }
         """
-        # Parallel per-video feature collection using process workers.
-        # Progress increments once per video.
-        all_per_frame: list[pd.DataFrame] = []
-        all_window: list[pd.DataFrame] = []
-        all_labels: list[np.ndarray] = []
-        all_group_keys: list[tuple[str, int]] = []
-
-        # Snapshot behavior settings once
         behavior_settings = self._settings_manager.get_behavior(behavior)
         videos = list(self._video_manager.videos)
-
-        # get the cross validation grouping strategy from project settings
         if grouping_strategy is None:
             grouping_strategy = self.settings_manager.cv_grouping_strategy
 
-        # Early exit if no videos
         if not videos:
             return {
                 "window": pd.DataFrame(),
@@ -860,73 +977,27 @@ def get_labeled_features(
                 "groups": np.array([], dtype=np.int32),
             }, {}
 
-        # Prepare per-video jobs with Path types (workers open resources)
-        jobs: list[FeatureLoadJobSpec] = []
+        jobs: list[BinaryFeatureLoadJobSpec] = []
         for video in videos:
             if should_terminate_callable:
                 should_terminate_callable()
-
-            job: FeatureLoadJobSpec = {
-                "video": video,
-                "video_path": self._video_manager.video_path(video),
-                "pose_path": self._video_manager.get_cached_pose_path(video),
-                "annotations_path": self._paths.annotations_dir / Path(video).with_suffix(".json"),
-                "feature_dir": self.feature_dir,
-                "cache_dir": self._paths.cache_dir,
-                "behavior_settings": behavior_settings,
+            job: BinaryFeatureLoadJobSpec = {
+                **self._build_feature_load_job_base(video, behavior_settings),
                 "behavior_name": behavior,
-                "behavior_names": None,
-                "classifier_mode": ClassifierMode.BINARY,
-                "cache_format": self.cache_format.value,
             }
             jobs.append(job)
 
-        executor = self._process_pool
-        results_by_video: dict[str, dict] = {}
-
-        if executor is not None:
-            # Parallel execution using the process pool
-            future_to_video = {
-                executor.submit(collect_labeled_features, job): job["video"] for job in jobs
-            }
-
-            for future in as_completed(future_to_video):
-                # check for early exit
-                if should_terminate_callable:
-                    should_terminate_callable()
-
-                video_name = future_to_video[future]
-                try:
-                    res = future.result()
-                except Exception as e:
-                    raise RuntimeError(f"Feature collection failed for video: {video_name}") from e
-
-                # Stage results by video for deterministic finalization
-                results_by_video[video_name] = res
-
-                if progress_callable:
-                    progress_callable()  # once per video
-        else:
-            # Single-threaded execution
-            for job in jobs:
-                # check for early exit
-                if should_terminate_callable:
-                    should_terminate_callable()
-
-                try:
-                    res = collect_labeled_features(job)
-                except Exception as e:
-                    raise RuntimeError(
-                        f"Feature collection failed for video: {job['video']}"
-                    ) from e
-
-                # Stage results by video for deterministic finalization
-                results_by_video[job["video"]] = res
-
-                if progress_callable:
-                    progress_callable()  # once per video
+        results_by_video = self._collect_features_parallel(
+            jobs,
+            collect_binary_labeled_features,
+            progress_callable,
+            should_terminate_callable,
+        )
 
-        # Deterministic finalize: append results in original 'videos' order
+        all_per_frame: list[pd.DataFrame] = []
+        all_window: list[pd.DataFrame] = []
+        all_labels: list[np.ndarray] = []
+        all_group_keys: list[tuple[str, int]] = []
         for video in videos:
             if video not in results_by_video:
                 continue
@@ -936,7 +1007,6 @@ def get_labeled_features(
             all_labels.extend(res["labels"])
             all_group_keys.extend(res["group_keys"])
 
-        # If nothing was produced anywhere, return empty structures
         if not (all_per_frame and all_window and all_labels):
             return {
                 "window": pd.DataFrame(),
@@ -945,55 +1015,14 @@ def get_labeled_features(
                 "groups": np.array([], dtype=np.int32),
             }, {}
 
-        # Build stable group ids based on grouping strategy
-        key_to_gid: dict[tuple[str, int], int] = {}
-        video_to_gid: dict[str, int] = {}
-        gid = 0
-        if grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
-            for v in videos:
-                seen: list[int] = []
-                for video_name, ident in all_group_keys:
-                    if video_name == v and ident not in seen:
-                        seen.append(ident)
-                for ident in seen:
-                    key = (v, ident)
-                    if key not in key_to_gid:
-                        key_to_gid[key] = gid
-                        gid += 1
-        elif grouping_strategy == CrossValidationGroupingStrategy.VIDEO:
-            for v in videos:
-                if v not in video_to_gid:
-                    video_to_gid[v] = gid
-                    gid += 1
-                for video_name, ident in all_group_keys:
-                    if video_name == v:
-                        key = (v, ident)
-                        key_to_gid[key] = video_to_gid[v]
-        else:
-            raise ValueError(f"Unknown grouping strategy: {grouping_strategy}")
-
-        # groups vector aligned with all_per_frame entries
-        groups_list: list[np.ndarray] = [
-            np.full(df.shape[0], key_to_gid[key], dtype=np.int32)
-            for key, df in zip(all_group_keys, all_per_frame, strict=True)
-        ]
-        groups = np.concatenate(groups_list) if groups_list else np.array([], dtype=np.int32)
-
-        # group_mapping: for INDIVIDUAL, maps gid to (video, identity); for VIDEO, maps gid to video only
-        if grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
-            group_mapping: dict[int, dict[str, int | str]] = {
-                gid: {"video": v, "identity": ident} for (v, ident), gid in key_to_gid.items()
-            }
-        else:
-            group_mapping: dict[int, dict[str, str | None]] = {
-                gid: {"video": v, "identity": None} for v, gid in video_to_gid.items()
-            }
-
+        key_to_gid, group_mapping = self._assign_cv_group_ids(
+            all_group_keys, videos, grouping_strategy
+        )
+        groups = self._build_groups_array(all_group_keys, all_per_frame, key_to_gid)
         window_df = pd.concat(all_window, join="inner")
         per_frame_df = pd.concat(all_per_frame, join="inner")
         labels_arr = np.concatenate(all_labels)
 
-        # Sanity check: ensure all outputs are aligned
         if not (len(labels_arr) == per_frame_df.shape[0] == window_df.shape[0] == groups.shape[0]):
             raise RuntimeError(
                 "Mismatch among labels/per_frame/window/groups lengths: "
@@ -1038,16 +1067,10 @@ def get_multiclass_labeled_features(
                 - ``labels_by_behavior``: dict[str, np.ndarray] of aligned labels
                 - ``groups``: np.ndarray of group ids
         """
-        all_per_frame: list[pd.DataFrame] = []
-        all_window: list[pd.DataFrame] = []
-        all_labels_by_behavior: dict[str, list[np.ndarray]] = {}
-        all_group_keys: list[tuple[str, int]] = []
-
         behavior_names = list(self.settings_manager.behavior_names)
         if behavior_settings is None:
             behavior_settings = self.get_project_defaults()
         videos = list(self._video_manager.videos)
-
         if grouping_strategy is None:
             grouping_strategy = self.settings_manager.cv_grouping_strategy
 
@@ -1059,72 +1082,34 @@ def get_multiclass_labeled_features(
                 "groups": np.array([], dtype=np.int32),
             }, {}
 
-        jobs: list[FeatureLoadJobSpec] = []
+        jobs: list[MulticlassFeatureLoadJobSpec] = []
         for video in videos:
             if should_terminate_callable:
                 should_terminate_callable()
-
-            job: FeatureLoadJobSpec = {
-                "video": video,
-                "video_path": self._video_manager.video_path(video),
-                "pose_path": self._video_manager.get_cached_pose_path(video),
-                "annotations_path": self._paths.annotations_dir / Path(video).with_suffix(".json"),
-                "feature_dir": self.feature_dir,
-                "cache_dir": self._paths.cache_dir,
-                "behavior_settings": behavior_settings,
-                "behavior_name": None,
+            job: MulticlassFeatureLoadJobSpec = {
+                **self._build_feature_load_job_base(video, behavior_settings),
                 "behavior_names": behavior_names,
-                "classifier_mode": ClassifierMode.MULTICLASS,
-                "cache_format": self.cache_format.value,
             }
             jobs.append(job)
 
-        executor = self._process_pool
-        results_by_video: dict[str, dict] = {}
-
-        if executor is not None:
-            future_to_video = {
-                executor.submit(collect_labeled_features, job): job["video"] for job in jobs
-            }
-
-            for future in as_completed(future_to_video):
-                if should_terminate_callable:
-                    should_terminate_callable()
-
-                video_name = future_to_video[future]
-                try:
-                    res = future.result()
-                except Exception as e:
-                    raise RuntimeError(f"Feature collection failed for video: {video_name}") from e
-
-                results_by_video[video_name] = res
-
-                if progress_callable:
-                    progress_callable()
-        else:
-            for job in jobs:
-                if should_terminate_callable:
-                    should_terminate_callable()
-                try:
-                    res = collect_labeled_features(job)
-                except Exception as e:
-                    raise RuntimeError(
-                        f"Feature collection failed for video: {job['video']}"
-                    ) from e
-
-                results_by_video[job["video"]] = res
-
-                if progress_callable:
-                    progress_callable()
+        results_by_video = self._collect_features_parallel(
+            jobs,
+            collect_multiclass_labeled_features,
+            progress_callable,
+            should_terminate_callable,
+        )
 
-        # Finalize in the original `videos` order so group ids are deterministic.
+        all_per_frame: list[pd.DataFrame] = []
+        all_window: list[pd.DataFrame] = []
+        all_labels_by_behavior: dict[str, list[np.ndarray]] = {}
+        all_group_keys: list[tuple[str, int]] = []
         for video in videos:
             if video not in results_by_video:
                 continue
             res = results_by_video[video]
             per_frame_items = res["per_frame"]
             window_items = res["window"]
-            labels_by_behavior_items = res["labels_by_behavior"] or []
+            labels_by_behavior_items = res["labels_by_behavior"]
             group_keys_items = res["group_keys"]
 
             if not (
@@ -1144,8 +1129,8 @@ def get_multiclass_labeled_features(
             all_per_frame.extend(per_frame_items)
             all_window.extend(window_items)
             all_group_keys.extend(group_keys_items)
-            # Preserve per-identity row alignment by appending label slices in the
-            # same order as per_frame/window/group entries.
+            # Append per-identity label slices in the same order as per_frame/window
+            # entries so they stay row-aligned after concatenation.
             for labels_by_behavior in labels_by_behavior_items:
                 for name, arr in labels_by_behavior.items():
                     all_labels_by_behavior.setdefault(name, []).append(arr)
@@ -1158,48 +1143,10 @@ def get_multiclass_labeled_features(
                 "groups": np.array([], dtype=np.int32),
             }, {}
 
-        key_to_gid: dict[tuple[str, int], int] = {}
-        video_to_gid: dict[str, int] = {}
-        gid = 0
-        if grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
-            for v in videos:
-                seen: list[int] = []
-                for video_name, ident in all_group_keys:
-                    if video_name == v and ident not in seen:
-                        seen.append(ident)
-                for ident in seen:
-                    key = (v, ident)
-                    if key not in key_to_gid:
-                        key_to_gid[key] = gid
-                        gid += 1
-        elif grouping_strategy == CrossValidationGroupingStrategy.VIDEO:
-            # All identities from the same video share one CV group id.
-            for v in videos:
-                if v not in video_to_gid:
-                    video_to_gid[v] = gid
-                    gid += 1
-                for video_name, ident in all_group_keys:
-                    if video_name == v:
-                        key = (v, ident)
-                        key_to_gid[key] = video_to_gid[v]
-        else:
-            raise ValueError(f"Unknown grouping strategy: {grouping_strategy}")
-
-        groups_list: list[np.ndarray] = [
-            np.full(df.shape[0], key_to_gid[key], dtype=np.int32)
-            for key, df in zip(all_group_keys, all_per_frame, strict=True)
-        ]
-        groups = np.concatenate(groups_list) if groups_list else np.array([], dtype=np.int32)
-
-        if grouping_strategy == CrossValidationGroupingStrategy.INDIVIDUAL:
-            group_mapping: dict[int, dict[str, int | str]] = {
-                gid: {"video": v, "identity": ident} for (v, ident), gid in key_to_gid.items()
-            }
-        else:
-            group_mapping: dict[int, dict[str, str | None]] = {
-                gid: {"video": v, "identity": None} for v, gid in video_to_gid.items()
-            }
-
+        key_to_gid, group_mapping = self._assign_cv_group_ids(
+            all_group_keys, videos, grouping_strategy
+        )
+        groups = self._build_groups_array(all_group_keys, all_per_frame, key_to_gid)
         window_df = pd.concat(all_window, join="inner")
         per_frame_df = pd.concat(all_per_frame, join="inner")
         n_rows = per_frame_df.shape[0]
@@ -1208,17 +1155,13 @@ def get_multiclass_labeled_features(
             for name, arrays in all_labels_by_behavior.items()
         }
 
-        expected_labels = {
-            MULTICLASS_NONE_BEHAVIOR,
-            *behavior_names,
-        }
-        missing_labels = expected_labels.difference(labels_by_behavior_arr.keys())
-        if missing_labels:
-            # Keep a stable key set for downstream multi-class consumers.
-            for missing in missing_labels:
-                labels_by_behavior_arr[missing] = np.full(
-                    n_rows, TrackLabels.Label.NONE, dtype=np.int8
-                )
+        # Ensure every expected behavior key is present so downstream consumers
+        # can rely on a stable key set even if a behavior had no labels anywhere.
+        expected_labels = {MULTICLASS_NONE_BEHAVIOR, *behavior_names}
+        for missing in expected_labels.difference(labels_by_behavior_arr.keys()):
+            labels_by_behavior_arr[missing] = np.full(
+                n_rows, TrackLabels.Label.NONE, dtype=np.int8
+            )
 
         if not (n_rows == window_df.shape[0] == groups.shape[0]):
             raise RuntimeError(
@@ -1226,7 +1169,6 @@ def get_multiclass_labeled_features(
                 f"per_frame={n_rows}, window={window_df.shape[0]}, groups={groups.shape[0]}"
             )
         for name, arr in labels_by_behavior_arr.items():
-            # Every behavior label vector must stay row-aligned with feature matrices.
             if arr.shape[0] != n_rows:
                 raise RuntimeError(
                     "Mismatch between multiclass label rows and features: "
diff --git a/src/jabs/ui/classification_thread.py b/src/jabs/ui/classification_thread.py
index f8f12e33..42748a0d 100644
--- a/src/jabs/ui/classification_thread.py
+++ b/src/jabs/ui/classification_thread.py
@@ -1,18 +1,20 @@
 import time
-from typing import cast
 
 import numpy as np
 import pandas as pd
 from PySide6.QtCore import QThread, Signal
 from PySide6.QtWidgets import QWidget
 
-from jabs.behavior.postprocessing import PostprocessingPipeline
 from jabs.classifier import Classifier, MultiClassClassifier
 from jabs.core.enums import ClassifierMode
 from jabs.feature_extraction import DEFAULT_WINDOW_SIZE, IdentityFeatures
 from jabs.project import Project
-from jabs.project.prediction_manager import MULTICLASS_PREDICTION_KEY
 
+from .classify_strategy import (
+    BinaryClassifyStrategy,
+    ClassifyStrategy,
+    MultiClassClassifyStrategy,
+)
 from .exceptions import ThreadTerminatedError
 
 
@@ -21,10 +23,11 @@ class ClassifyThread(QThread):
     Thread used to run classification in the background, keeping the Qt main GUI thread responsive.
 
     Signals:
-        classification_complete: QtCore.Signal(dict)
-            Emitted when classification is finished successfully. The emitted dict
-            contains predictions, probabilities, and frame indexes for the current video so that
-            the UI can update accordingly.
+        classification_complete: QtCore.Signal(dict, int)
+            Emitted when classification is finished successfully. The dict carries
+            predictions, probabilities, post-processed predictions, and class_names
+            for the current video so the UI can update; the int is the elapsed
+            wall-clock time in milliseconds.
         current_status: QtCore.Signal(str)
             Emitted to update the main GUI thread with a status message (e.g., for a status bar).
         update_progress: QtCore.Signal(int)
@@ -35,7 +38,7 @@ class ClassifyThread(QThread):
             to the main GUI thread.
 
     Args:
-        classifier (Classifier): The classifier instance to use for predictions.
+        classifier (Classifier | MultiClassClassifier): The classifier to use for predictions.
         project (Project): The project containing data and settings.
         behavior (str): The behavior label to classify.
         current_video (str): The video currently loaded in the video player.
@@ -82,15 +85,29 @@ def request_termination(self) -> None:
         """
         self._should_terminate = True
 
+    def _build_strategy(self) -> ClassifyStrategy:
+        """Construct the classify strategy that matches the project's classifier mode."""
+        if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
+            return MultiClassClassifyStrategy(
+                classifier=self._classifier,  # NOTE(review): type not checked here
+                project=self._project,
+                behavior=self._behavior,
+            )
+        return BinaryClassifyStrategy(  # every mode other than MULTICLASS is treated as binary
+            classifier=self._classifier,
+            project=self._project,
+            behavior=self._behavior,
+        )
+
     def run(self) -> None:
-        """thread's main function.
+        """Thread's main function.
 
-        runs the classifier for each identity in each video
+        Runs the classifier for each identity in each video.
         """
         self._tasks_complete = 0
-        current_video_predictions = {}
-        current_video_probabilities = {}
-        current_video_predictions_postprocessed = {}
+        current_video_predictions: dict = {}
+        current_video_probabilities: dict = {}
+        current_video_predictions_postprocessed: dict = {}
         t0_ns = time.perf_counter_ns()
 
         def check_termination_requested() -> None:
@@ -98,28 +115,9 @@ def check_termination_requested() -> None:
                 raise ThreadTerminatedError("Classification was cancelled by the user")
 
         try:
-            multiclass_mode = (
-                self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS
-            )
-            if multiclass_mode:
-                multiclass_classifier = cast(MultiClassClassifier, self._classifier)
-                # Multiclass classification reuses one shared classifier/settings bundle
-                # across all behaviors and writes a single reserved prediction record.
-                project_settings = (
-                    multiclass_classifier.project_settings or self._project.get_project_defaults()
-                )
-                prediction_behavior = MULTICLASS_PREDICTION_KEY
-                class_names = multiclass_classifier.get_class_names()
-                postprocessing_pipeline = None
-            else:
-                project_settings = self._project.settings_manager.get_behavior(self._behavior)
-                prediction_behavior = self._behavior
-                class_names = None
-                postprocessing_pipeline = PostprocessingPipeline(
-                    project_settings.get("postprocessing", [])
-                )
+            strategy = self._build_strategy()
+            project_settings = strategy.project_settings()
 
-            # iterate over each video in the project
             for video in self._project.video_manager.videos:
                 check_termination_requested()
 
@@ -127,17 +125,15 @@ def check_termination_requested() -> None:
                 pose_est = self._project.load_pose_est(video_path)
                 fps = pose_est.fps
 
-                # collect predictions, probabilities, and frame indexes for each identity in the video
-                predictions = {}
-                probabilities = {}
-                postprocessed_predictions = {}
+                predictions: dict = {}
+                probabilities: dict = {}
+                postprocessed_predictions: dict = {}
 
                 for identity in pose_est.identities:
                     check_termination_requested()
 
                     self.current_status.emit(f"Classifying {video},  Identity {identity}")
 
-                    # get the features for this identity
                     features = IdentityFeatures(
                         video,
                         identity,
@@ -158,70 +154,54 @@ def check_termination_requested() -> None:
 
                     check_termination_requested()
                     if data.shape[0] > 0:
-                        # Get probabilities for all classes
                         prob = self._classifier.predict_proba(
                             data, feature_values["frame_indexes"]
                         )
-
                         predictions[identity], confidence = self._classifier.derive_predictions(
                             prob
                         )
-                        # Binary mode persists per-frame confidence; multiclass mode persists
-                        # the full class-probability matrix for timeline decomposition.
-                        probabilities[identity] = prob if multiclass_mode else confidence
+                        probabilities[identity] = strategy.probabilities_for_storage(
+                            prob, confidence
+                        )
                     else:
                         predictions[identity] = np.full(pose_est.num_frames, -1, dtype=np.int8)
-                        if multiclass_mode:
-                            probabilities[identity] = np.zeros(
-                                (pose_est.num_frames, len(class_names)),
-                                dtype=np.float32,
-                            )
-                        else:
-                            probabilities[identity] = np.zeros(
-                                pose_est.num_frames, dtype=np.float32
-                            )
-
-                    if not multiclass_mode and postprocessing_pipeline is not None:
-                        # Post-processing semantics are currently binary-only
-                        postprocessed_predictions[identity] = postprocessing_pipeline.run(
-                            predictions[identity], probabilities[identity]
-                        )
+                        probabilities[identity] = strategy.empty_probabilities(pose_est.num_frames)
+
+                    postprocessed = strategy.postprocess_identity(
+                        predictions[identity], probabilities[identity]
+                    )
+                    if postprocessed is not None:
+                        postprocessed_predictions[identity] = postprocessed
 
                 if video == self._current_video:
-                    # keep predictions for the video currently loaded in the video player
                     current_video_predictions = predictions
                     current_video_probabilities = probabilities
                     current_video_predictions_postprocessed = postprocessed_predictions
 
-                # save predictions to disk
                 self.current_status.emit("Saving Predictions")
                 self._project.save_predictions(
                     pose_est,
                     video,
                     predictions,
                     probabilities,
-                    prediction_behavior,
+                    strategy.prediction_behavior(),
                     self._classifier,
                     postprocessed_predictions=postprocessed_predictions,
-                    class_names=class_names,
+                    class_names=strategy.class_names(),
                 )
 
                 self._tasks_complete += 1
                 self.update_progress.emit(self._tasks_complete)
 
             elapsed_ms = int((time.perf_counter_ns() - t0_ns) // 1_000_000)
-            # emits the predictions, probabilities, and frame indexes for the video currently loaded in
-            # the video player, so that it can update the UI accordingly to show the new predictions
             self.classification_complete.emit(
                 {
                     "predictions": current_video_predictions,
                     "probabilities": current_video_probabilities,
                     "predictions_postprocessed": current_video_predictions_postprocessed,
-                    "class_names": class_names,
+                    "class_names": strategy.class_names(),
                 },
                 elapsed_ms,
             )
         except Exception as e:
-            # if there was an exception, we'll emit the Exception as a signal so that
-            # the main GUI thread can handle it
             self.error_callback.emit(e)
diff --git a/src/jabs/ui/classify_strategy.py b/src/jabs/ui/classify_strategy.py
new file mode 100644
index 00000000..9f111815
--- /dev/null
+++ b/src/jabs/ui/classify_strategy.py
@@ -0,0 +1,183 @@
+"""Per-mode classification strategies used by :class:`ClassifyThread`.
+
+Two strategies - :class:`BinaryClassifyStrategy` and
+:class:`MultiClassClassifyStrategy` - implement the mode-specific pieces of
+the classification pipeline (effective settings, the behavior key under which
+predictions are persisted, the optional class-name list, the per-identity
+probabilities to store, the zero-fill shape used when an identity has no
+data, and the optional postprocessing pipeline). The orchestrator in
+:class:`ClassifyThread` consumes the strategy and stays mode-agnostic.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, cast
+
+import numpy as np
+import numpy.typing as npt
+
+from jabs.behavior.postprocessing import PostprocessingPipeline
+from jabs.classifier import Classifier, MultiClassClassifier
+from jabs.project.prediction_manager import MULTICLASS_PREDICTION_KEY
+
+if TYPE_CHECKING:
+    from jabs.project import Project
+
+
+class ClassifyStrategy:
+    """Base set of mode-specific hooks; every hook must be overridden."""
+
+    def __init__(
+        self,
+        classifier: Classifier | MultiClassClassifier,
+        project: Project,
+        behavior: str,
+    ) -> None:
+        # Shared handles that the concrete strategies read from.
+        self._classifier, self._project = classifier, project
+        self._behavior = behavior
+
+    def project_settings(self) -> dict:
+        """Settings dict for feature extraction and inference in this mode."""
+        raise NotImplementedError()
+
+    def prediction_behavior(self) -> str:
+        """Key under which this mode's predictions are written to disk."""
+        raise NotImplementedError()
+
+    def class_names(self) -> list[str] | None:
+        """Class-name list that is saved with predictions and emitted on completion.
+
+        Binary mode yields ``None``: its record is keyed by the behavior and
+        carries no per-class probability matrix.
+        """
+        raise NotImplementedError()
+
+    def probabilities_for_storage(
+        self,
+        prob: npt.NDArray[np.float32],
+        confidence: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.float32]:
+        """Choose which per-frame probabilities get persisted for one identity.
+
+        Args:
+            prob: Per-class probability matrix produced by ``predict_proba``,
+                shaped ``(n_frames, n_classes)``.
+            confidence: Confidence of the selected class for each frame,
+                shaped ``(n_frames,)``.
+
+        Returns:
+            The array to persist; its shape is decided by the concrete mode.
+        """
+        raise NotImplementedError()
+
+    def empty_probabilities(self, num_frames: int) -> npt.NDArray[np.float32]:
+        """Zero-filled stand-in probabilities for an identity without any data."""
+        raise NotImplementedError()
+
+    def postprocess_identity(
+        self,
+        predictions: npt.NDArray[np.int8],
+        probabilities: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.int8] | None:
+        """Postprocess one identity's predictions; ``None`` when not applicable."""
+        raise NotImplementedError()
+
+
+class BinaryClassifyStrategy(ClassifyStrategy):
+    """Strategy for the two-way (behavior vs. not-behavior) classifier."""
+
+    def __init__(
+        self,
+        classifier: Classifier,
+        project: Project,
+        behavior: str,
+    ) -> None:
+        super().__init__(classifier, project, behavior)
+        behavior_settings = project.settings_manager.get_behavior(behavior)
+        pipeline_config = behavior_settings.get("postprocessing", [])
+        self._project_settings = behavior_settings
+        self._postprocessing_pipeline = PostprocessingPipeline(pipeline_config)
+
+    def project_settings(self) -> dict:
+        """Settings fetched from the settings manager for this behavior."""
+        return self._project_settings
+
+    def prediction_behavior(self) -> str:
+        """The behavior name doubles as the prediction record key."""
+        return self._behavior
+
+    def class_names(self) -> list[str] | None:
+        """Always ``None``: no class-name list is stored in binary mode."""
+        return None
+
+    def probabilities_for_storage(
+        self,
+        prob: npt.NDArray[np.float32],
+        confidence: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.float32]:
+        """Keep only ``confidence``; the full ``prob`` matrix is not stored."""
+        return confidence
+
+    def empty_probabilities(self, num_frames: int) -> npt.NDArray[np.float32]:
+        """Flat float32 zeros with one entry per frame."""
+        return np.zeros((num_frames,), dtype=np.float32)
+
+    def postprocess_identity(
+        self,
+        predictions: npt.NDArray[np.int8],
+        probabilities: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.int8] | None:
+        """Feed this identity's arrays through the configured postprocessing pipeline."""
+        return self._postprocessing_pipeline.run(predictions, probabilities)
+
+
+class MultiClassClassifyStrategy(ClassifyStrategy):
+    """Strategy for the classifier that chooses among several behaviors."""
+
+    def __init__(
+        self,
+        classifier: MultiClassClassifier,
+        project: Project,
+        behavior: str,
+    ) -> None:
+        super().__init__(classifier, project, behavior)
+        typed = cast(MultiClassClassifier, classifier)
+        # One settings bundle covers all behaviors; absent one, use project defaults.
+        bundle = typed.project_settings
+        if not bundle:
+            bundle = project.get_project_defaults()
+        self._project_settings = bundle
+        self._class_names: list[str] = typed.get_class_names()
+
+    def project_settings(self) -> dict:
+        """Settings resolved once, when the strategy was constructed."""
+        return self._project_settings
+
+    def prediction_behavior(self) -> str:
+        """Predictions are stored under the reserved multi-class key."""
+        return MULTICLASS_PREDICTION_KEY
+
+    def class_names(self) -> list[str] | None:
+        """Class names captured from the classifier at construction time."""
+        return self._class_names
+
+    def probabilities_for_storage(
+        self,
+        prob: npt.NDArray[np.float32],
+        confidence: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.float32]:
+        """Store the whole ``prob`` matrix; ``confidence`` is not used here."""
+        return prob
+
+    def empty_probabilities(self, num_frames: int) -> npt.NDArray[np.float32]:
+        """Float32 zeros shaped ``(num_frames, n_classes)``."""
+        return np.zeros(shape=(num_frames, len(self._class_names)), dtype=np.float32)
+
+    def postprocess_identity(
+        self,
+        predictions: npt.NDArray[np.int8],
+        probabilities: npt.NDArray[np.float32],
+    ) -> npt.NDArray[np.int8] | None:
+        """Always ``None``: postprocessing is currently defined for binary mode only."""
+        return None
diff --git a/src/jabs/ui/main_window/central_widget.py b/src/jabs/ui/main_window/central_widget.py
index 54381d8a..ec2c0801 100644
--- a/src/jabs/ui/main_window/central_widget.py
+++ b/src/jabs/ui/main_window/central_widget.py
@@ -16,7 +16,6 @@
 from jabs.core.enums import (
     ClassifierMode,
     ClassifierType,
-    CrossValidationGroupingStrategy,
     PredictionType,
 )
 from jabs.pose_estimation import PoseEstimation, PoseEstimationV8
@@ -25,7 +24,6 @@
 from ..behavior_timeline import (
     BehaviorTimelineWidget,
     binary_predictions_to_lut_indices,
-    track_labels_to_lut_indices,
 )
 from ..classification_thread import ClassifyThread
 from ..dialogs import AnnotationEditDialog, TrainingReportDialog
@@ -36,6 +34,7 @@
 from ..player_widget import PlayerWidget
 from ..search_bar_widget import SearchBarWidget
 from ..training_thread import TrainingThread
+from . import central_widget_mode
 
 _CLICK_THRESHOLD = 20
 _DEBOUNCE_SEARCH_DELAY_MS = 100
@@ -392,18 +391,17 @@ def load_video(self, path: Path) -> None:
             self._player_widget.load_video(path, self._pose_est, self._labels)
 
             # load saved predictions for this video
-            if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-                (
-                    self._predictions,
-                    self._probabilities,
-                    self._predictions_postprocessed,
-                    self._multiclass_class_names,
-                ) = self._project.prediction_manager.load_multiclass_predictions(path.name)
-            else:
-                self._predictions, self._probabilities, self._predictions_postprocessed = (
-                    self._project.prediction_manager.load_predictions(path.name, self.behavior)
-                )
-                self._multiclass_class_names = None
+            (
+                self._predictions,
+                self._probabilities,
+                self._predictions_postprocessed,
+                self._multiclass_class_names,
+            ) = central_widget_mode.load_video_predictions(
+                self._project.prediction_manager,
+                self._project.settings_manager.classifier_mode,
+                path.name,
+                self.behavior,
+            )
 
             # update ui components with properties of new video
             display_identities = [
@@ -736,52 +734,46 @@ def _curr_selection_end(self) -> int:
     def _label_behavior(self) -> None:
         """Apply behavior label to currently selected range of frames."""
         start, end = sorted([self._selection_start, self._curr_selection_end])
-        if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-            identity_str = str(self._controls.current_identity_index)
-            current_behavior = self._controls.current_behavior
-            for behavior, track in self._labels.iter_behavior_labels(identity_str):
-                if behavior != current_behavior:
-                    track.clear_labels(start, end)
+        identity_index = self._controls.current_identity_index
+        current_behavior = self._controls.current_behavior
+        central_widget_mode.apply_behavior_label(
+            self._labels,
+            self._project.settings_manager.classifier_mode,
+            str(identity_index),
+            current_behavior,
+            start,
+            end,
+        )
         self._project.session_tracker.label_created(
             self._loaded_video,
-            self._controls.current_identity_index,
-            self._controls.current_behavior,
+            identity_index,
+            current_behavior,
             True,
             start,
             end,
         )
-        self._get_label_track().label_behavior(start, end)
         self._label_button_common()
 
     def _label_not_behavior(self) -> None:
         """Apply not-behavior label (binary) or None label (multi-class) to selected frames."""
         start, end = sorted([self._selection_start, self._curr_selection_end])
-        if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-            identity_str = str(self._controls.current_identity_index)
-            for behavior, track in self._labels.iter_behavior_labels(identity_str):
-                if behavior != MULTICLASS_NONE_BEHAVIOR:
-                    track.clear_labels(start, end)
-            self._project.session_tracker.label_created(
-                self._loaded_video,
-                self._controls.current_identity_index,
-                MULTICLASS_NONE_BEHAVIOR,
-                True,
-                start,
-                end,
-            )
-            self._labels.get_track_labels(identity_str, MULTICLASS_NONE_BEHAVIOR).label_behavior(
-                start, end
-            )
-        else:
-            self._project.session_tracker.label_created(
-                self._loaded_video,
-                self._controls.current_identity_index,
-                self._controls.current_behavior,
-                False,
-                start,
-                end,
-            )
-            self._get_label_track().label_not_behavior(start, end)
+        identity_index = self._controls.current_identity_index
+        behavior_key, is_positive_label = central_widget_mode.apply_not_behavior_label(
+            self._labels,
+            self._project.settings_manager.classifier_mode,
+            str(identity_index),
+            self._controls.current_behavior,
+            start,
+            end,
+        )
+        self._project.session_tracker.label_created(
+            self._loaded_video,
+            identity_index,
+            behavior_key,
+            is_positive_label,
+            start,
+            end,
+        )
         self._label_button_common()
 
     def _clear_behavior_label(self) -> None:
@@ -846,28 +838,26 @@ def _set_label_track(self) -> None:
             mask_list = [
                 self._pose_est.identity_mask(i) for i in range(self._pose_est.num_identities)
             ]
-            if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-                behavior_names = self._controls.behaviors
-                multiclass_arrays = [
-                    self._labels.build_multiclass_label_array(str(i), behavior_names)
-                    for i in range(self._pose_est.num_identities)
-                ]
-                self._jabs_timeline.set_labels(multiclass_arrays, mask_list)
-                if self._label_overlay_mode == PlayerWidget.LabelOverlayMode.LABEL:
+            mode = self._project.settings_manager.classifier_mode
+            timeline_labels = central_widget_mode.build_timeline_label_arrays(
+                self._labels,
+                mode,
+                self._pose_est.num_identities,
+                current_behavior=behavior,
+                behaviors=self._controls.behaviors,
+            )
+            self._jabs_timeline.set_labels(timeline_labels, mask_list)
+            if self._label_overlay_mode == PlayerWidget.LabelOverlayMode.LABEL:
+                if mode == ClassifierMode.MULTICLASS:
                     lut = self._jabs_timeline.multiclass_color_lut
                     if lut is not None:
                         self._player_widget.set_label_color_lut(lut)
-                        self._player_widget.set_labels(multiclass_arrays)
+                        self._player_widget.set_labels(timeline_labels)
                     else:
                         self._player_widget.set_label_color_lut(None)
                         self._player_widget.set_labels(None)
-            else:
-                label_list = self._get_label_list()
-                self._jabs_timeline.set_labels(
-                    [track_labels_to_lut_indices(t) for t in label_list],
-                    mask_list,
-                )
-                if self._label_overlay_mode == PlayerWidget.LabelOverlayMode.LABEL:
+                else:
+                    label_list = self._get_label_list()
                     self._player_widget.set_label_color_lut(None)
                     self._player_widget.set_labels([labels.get_labels() for labels in label_list])
 
@@ -939,16 +929,20 @@ def _train_button_clicked(self) -> None:
         # use one task for reading features from each video, plus one for training, plus one each for cross validation iterations.
         total_steps = self._project.video_manager.num_videos + 1
         if self._controls.all_kfold:
-            if self._project.settings_manager.classifier_mode != ClassifierMode.MULTICLASS:
+            if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
+                behavior_names = [MULTICLASS_NONE_BEHAVIOR, *self._controls.behaviors]
+                counts_by_behavior = {name: self._project.counts(name) for name in behavior_names}
+                total_steps += MultiClassClassifier.count_label_threshold(
+                    counts_by_behavior=counts_by_behavior,
+                    behavior_names=behavior_names,
+                    cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
+                )
+            else:
                 project_counts = self._project.counts(self._controls.current_behavior)
                 total_steps += self._classifier.count_label_threshold(
                     project_counts,
                     cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
                 )
-            else:
-                # For multiclass all-kfold mode we do not know valid split count until
-                # feature extraction completes; start with a conservative estimate.
-                total_steps += self._project.video_manager.num_videos
         else:
             total_steps += self._controls.kfold_value
 
@@ -1368,7 +1362,15 @@ def set_train_button_enabled_state(self) -> None:
             return
 
         if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-            threshold_met = self._multiclass_train_threshold_met()
+            behavior_names = [MULTICLASS_NONE_BEHAVIOR, *self._controls.behaviors]
+            counts_by_behavior = {name: self._project.counts(name) for name in behavior_names}
+            min_groups = 1 if self._controls.all_kfold else self._controls.kfold_value
+            threshold_met = MultiClassClassifier.label_threshold_met(
+                counts_by_behavior=counts_by_behavior,
+                behavior_names=behavior_names,
+                min_groups=min_groups,
+                cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
+            )
         else:
             threshold_met = Classifier.label_threshold_met(
                 self._counts,
@@ -1383,77 +1385,6 @@ def set_train_button_enabled_state(self) -> None:
             self._controls.train_button_enabled = False
             self.export_training_status_change.emit(False)
 
-    def _multiclass_train_threshold_met(self) -> bool:
-        """Return True when multiclass labels support the requested number of LOGO splits."""
-        behavior_names = [MULTICLASS_NONE_BEHAVIOR, *self._controls.behaviors]
-        if len(behavior_names) < 2:
-            return False
-
-        counts_by_behavior: dict[str, dict] = {}
-        for behavior_name in behavior_names:
-            counts_by_behavior[behavior_name] = self._project.counts(behavior_name)
-
-        valid_splits = self._count_multiclass_valid_logo_splits(
-            counts_by_behavior=counts_by_behavior,
-            behavior_names=behavior_names,
-            grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
-            threshold=MultiClassClassifier.LABEL_THRESHOLD,
-        )
-        requested_splits = valid_splits if self._controls.all_kfold else self._controls.kfold_value
-        return valid_splits >= max(1, requested_splits)
-
-    @staticmethod
-    def _count_multiclass_valid_logo_splits(
-        counts_by_behavior: dict[str, dict],
-        behavior_names: list[str],
-        grouping_strategy: CrossValidationGroupingStrategy,
-        threshold: int,
-    ) -> int:
-        """Count valid multiclass LOGO splits using fragmented per-group frame counts."""
-        if not behavior_names:
-            return 0
-
-        group_class_counts: dict[tuple[str, int] | str, dict[str, int]] = {}
-        for behavior_name in behavior_names:
-            behavior_counts = counts_by_behavior.get(behavior_name, {})
-            for video_name, video_counts in behavior_counts.items():
-                if grouping_strategy == CrossValidationGroupingStrategy.VIDEO:
-                    key: tuple[str, int] | str = video_name
-                    group_entry = group_class_counts.setdefault(key, {})
-                    group_entry[behavior_name] = group_entry.get(behavior_name, 0) + sum(
-                        identity_counts["fragmented_frame_counts"][0]
-                        for identity_counts in video_counts.values()
-                    )
-                else:
-                    for identity, identity_counts in video_counts.items():
-                        key = (video_name, int(identity))
-                        group_entry = group_class_counts.setdefault(key, {})
-                        group_entry[behavior_name] = identity_counts["fragmented_frame_counts"][0]
-
-        if not group_class_counts:
-            return 0
-
-        total_by_class = {
-            class_name: sum(
-                group_counts.get(class_name, 0) for group_counts in group_class_counts.values()
-            )
-            for class_name in behavior_names
-        }
-
-        valid_groups = 0
-        for group_counts in group_class_counts.values():
-            n_test_classes = sum(
-                group_counts.get(class_name, 0) >= threshold for class_name in behavior_names
-            )
-            train_has_all_classes = all(
-                (total_by_class[class_name] - group_counts.get(class_name, 0)) >= threshold
-                for class_name in behavior_names
-            )
-            if n_test_classes >= 2 and train_has_all_classes:
-                valid_groups += 1
-
-        return valid_groups
-
     def _update_label_counts(self) -> None:
         """update the widget with the labeled frame / bout counts
 
diff --git a/src/jabs/ui/main_window/central_widget_mode.py b/src/jabs/ui/main_window/central_widget_mode.py
new file mode 100644
index 00000000..7eb818b0
--- /dev/null
+++ b/src/jabs/ui/main_window/central_widget_mode.py
@@ -0,0 +1,128 @@
+"""Per-mode dispatch helpers for CentralWidget label and prediction operations.
+
+CentralWidget hosts both binary and multi-class workflows. Each helper here
+encapsulates one operation that diverges by classifier mode so the widget
+itself can call the operation uniformly and the mode check lives in one place.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import numpy.typing as npt
+
+from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
+from jabs.core.enums import ClassifierMode
+
+from ..behavior_timeline import track_labels_to_lut_indices
+
+if TYPE_CHECKING:
+    from jabs.project import VideoLabels
+    from jabs.project.prediction_manager import PredictionManager
+
+
+def load_video_predictions(
+    prediction_manager: PredictionManager,
+    mode: ClassifierMode,
+    video_name: str,
+    behavior: str,
+) -> tuple[
+    dict[int, np.ndarray],
+    dict[int, np.ndarray],
+    dict[int, np.ndarray],
+    list[str] | None,
+]:
+    """Fetch the stored predictions of one video using the mode's loader.
+
+    Args:
+        prediction_manager: Prediction manager that performs the actual load.
+        mode: Classifier mode that selects which loader is called.
+        video_name: File name of the video whose predictions are wanted.
+        behavior: Behavior key; ignored unless the mode is binary.
+
+    Returns:
+        ``(predictions, probabilities, postprocessed_predictions, class_names)``.
+        Multi-class mode returns the loader's result unchanged; binary mode
+        appends ``None`` as ``class_names``.
+    """
+    if mode != ClassifierMode.MULTICLASS:
+        binary_record = prediction_manager.load_predictions(video_name, behavior)
+        raw, probs, post = binary_record
+        # Pad the binary triple so both modes hand back a 4-tuple.
+        return raw, probs, post, None
+    return prediction_manager.load_multiclass_predictions(video_name)
+
+
+def apply_behavior_label(
+    labels: VideoLabels,
+    mode: ClassifierMode,
+    identity_str: str,
+    current_behavior: str,
+    start: int,
+    end: int,
+) -> None:
+    """Label the ``start``..``end`` range with ``current_behavior`` for one identity.
+
+    Multi-class mode first clears that range on every other behavior's track.
+    """
+    if mode == ClassifierMode.MULTICLASS:
+        for name, other_track in labels.iter_behavior_labels(identity_str):
+            if name == current_behavior:
+                continue
+            other_track.clear_labels(start, end)
+    target_track = labels.get_track_labels(identity_str, current_behavior)
+    target_track.label_behavior(start, end)
+
+
+def apply_not_behavior_label(
+    labels: VideoLabels,
+    mode: ClassifierMode,
+    identity_str: str,
+    current_behavior: str,
+    start: int,
+    end: int,
+) -> tuple[str, bool]:
+    """Record the mode's notion of a negative label on the given range.
+
+    Binary mode marks the range as not-behavior on the ``current_behavior``
+    track. Multi-class mode clears the range on every track other than
+    :data:`MULTICLASS_NONE_BEHAVIOR` and then labels that reserved track.
+
+    Returns:
+        ``(behavior_key, is_positive_label)``, intended for session-tracker
+        logging: ``(current_behavior, False)`` in binary mode and
+        ``(MULTICLASS_NONE_BEHAVIOR, True)`` in multi-class mode.
+    """
+    if mode != ClassifierMode.MULTICLASS:
+        labels.get_track_labels(identity_str, current_behavior).label_not_behavior(start, end)
+        return current_behavior, False
+    for name, track in labels.iter_behavior_labels(identity_str):
+        if name != MULTICLASS_NONE_BEHAVIOR:
+            track.clear_labels(start, end)
+    none_track = labels.get_track_labels(identity_str, MULTICLASS_NONE_BEHAVIOR)
+    none_track.label_behavior(start, end)
+    return MULTICLASS_NONE_BEHAVIOR, True
+
+
+def build_timeline_label_arrays(
+    labels: VideoLabels,
+    mode: ClassifierMode,
+    num_identities: int,
+    current_behavior: str,
+    behaviors: list[str],
+) -> list[npt.NDArray]:
+    """Assemble one timeline label array for each identity index.
+
+    Multi-class mode asks ``labels`` for a merged array over ``behaviors``.
+    Binary mode converts the ``current_behavior`` track of each identity
+    with ``track_labels_to_lut_indices``. Identity keys are the stringified
+    indices ``0 .. num_identities - 1``.
+    """
+    identity_keys = [str(i) for i in range(num_identities)]
+    if mode == ClassifierMode.MULTICLASS:
+        return [labels.build_multiclass_label_array(key, behaviors) for key in identity_keys]
+    return [
+        track_labels_to_lut_indices(labels.get_track_labels(key, current_behavior))
+        for key in identity_keys
+    ]
diff --git a/src/jabs/ui/training_strategy.py b/src/jabs/ui/training_strategy.py
new file mode 100644
index 00000000..6d75ac7d
--- /dev/null
+++ b/src/jabs/ui/training_strategy.py
@@ -0,0 +1,277 @@
+"""Per-mode training strategies used by :class:`TrainingThread`.
+
+Two strategies — :class:`BinaryTrainingStrategy` and
+:class:`MultiClassTrainingStrategy` — implement the mode-specific pieces of
+the training pipeline (feature collection, final-train data dict, classifier
+save target, report content, and the secondary CV metric reported to the
+session tracker). The orchestrator in :class:`TrainingThread` consumes the
+strategy and stays mode-agnostic.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from datetime import datetime
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pandas as pd
+
+from jabs.classifier import (
+    Classifier,
+    MultiClassClassifier,
+    TrainingReportData,
+    classifier_utils,
+)
+
+if TYPE_CHECKING:
+    from jabs.classifier import CrossValidationResult
+    from jabs.core.enums import CrossValidationGroupingStrategy
+    from jabs.project import Project
+
+
+class TrainingStrategy:
+    """Per-mode training hooks; the base versions all raise ``NotImplementedError``."""
+
+    def __init__(
+        self,
+        classifier: Classifier | MultiClassClassifier,
+        project: Project,
+        behavior: str,
+    ) -> None:
+        self._classifier = classifier  # model instance the subclasses save and report on
+        self._project = project  # source of labeled features, settings and persistence
+        self._behavior = behavior  # behavior name used for lookups and in the report
+
+    def collect_features(
+        self,
+        progress_callable: Callable[[], None],
+        should_terminate_callable: Callable[[], None],
+    ) -> tuple[dict, dict]:
+        """Return the ``(features, group_mapping)`` pair of labeled data for this mode."""
+        raise NotImplementedError
+
+    def effective_settings(self) -> dict:
+        """Return the settings dict used for feature extraction, training and the report."""
+        raise NotImplementedError
+
+    def final_train_data(
+        self,
+        features: dict,
+        full_dataset: pd.DataFrame,
+        feature_names: list[str],
+    ) -> dict:
+        """Build the dict handed to ``classifier.train`` when fitting the final model."""
+        raise NotImplementedError
+
+    def save_classifier(self) -> None:
+        """Persist the trained classifier (subclasses choose the save target)."""
+        raise NotImplementedError
+
+    def build_report_data(
+        self,
+        features: dict,
+        cv_results: list[CrossValidationResult],
+        final_top_features: list[tuple[str, float]],
+        elapsed_ms: int,
+        timestamp: datetime,
+        cv_grouping_strategy: CrossValidationGroupingStrategy,
+        distance_unit: str,
+        settings: dict,
+    ) -> TrainingReportData:
+        """Assemble the ``TrainingReportData`` describing the finished training run."""
+        raise NotImplementedError
+
+    def cv_secondary_metric(self, cv_results: list[CrossValidationResult]) -> float | None:
+        """Return the mean mode-specific secondary CV metric, or ``None`` if unavailable."""
+        raise NotImplementedError
+
+
+class BinaryTrainingStrategy(TrainingStrategy):
+    """Strategy hooks for training a single behavior / not-behavior classifier."""
+
+    def __init__(
+        self,
+        classifier: Classifier,
+        project: Project,
+        behavior: str,
+        bout_counts: tuple[int, int],
+    ) -> None:
+        super().__init__(classifier, project, behavior)
+        self._bout_counts = bout_counts
+
+    def collect_features(
+        self,
+        progress_callable: Callable[[], None],
+        should_terminate_callable: Callable[[], None],
+    ) -> tuple[dict, dict]:
+        """Fetch the project's labeled features for this strategy's behavior."""
+        fetch_labeled = self._project.get_labeled_features
+        return fetch_labeled(
+            self._behavior,
+            progress_callable=progress_callable,
+            should_terminate_callable=should_terminate_callable,
+        )
+
+    def effective_settings(self) -> dict:
+        """Look up this behavior's settings through the project's settings manager."""
+        return self._project.settings_manager.get_behavior(self._behavior)
+
+    def final_train_data(
+        self,
+        features: dict,
+        full_dataset: pd.DataFrame,
+        feature_names: list[str],
+    ) -> dict:
+        """Package the combined dataset, its labels and the feature names for ``train``."""
+        labels = features["labels"]
+        payload = {"training_data": full_dataset, "training_labels": labels}
+        payload["feature_names"] = feature_names
+        return payload
+
+    def save_classifier(self) -> None:
+        """Save the classifier via the project, scoped to this strategy's behavior."""
+        self._project.save_classifier(self._classifier, self._behavior)
+
+    def build_report_data(
+        self,
+        features: dict,
+        cv_results: list[CrossValidationResult],
+        final_top_features: list[tuple[str, float]],
+        elapsed_ms: int,
+        timestamp: datetime,
+        cv_grouping_strategy: CrossValidationGroupingStrategy,
+        distance_unit: str,
+        settings: dict,
+    ) -> TrainingReportData:
+        """Assemble the binary report from label frame counts and the stored bout counts."""
+        labels = features["labels"]
+        n_behavior, n_not_behavior = (int(np.sum(labels == flag)) for flag in (1, 0))
+        positive_bouts, negative_bouts = self._bout_counts
+        return TrainingReportData(
+            behavior_name=self._behavior,
+            classifier_type=self._classifier.classifier_name,
+            balance_training_labels=settings.get("balance_labels", False),
+            symmetric_behavior=settings.get("symmetric_behavior", False),
+            distance_unit=distance_unit,
+            cv_results=cv_results,
+            final_top_features=final_top_features,
+            frames_behavior=n_behavior,
+            frames_not_behavior=n_not_behavior,
+            bouts_behavior=positive_bouts,
+            bouts_not_behavior=negative_bouts,
+            training_time_ms=elapsed_ms,
+            timestamp=timestamp,
+            window_size=settings["window_size"],
+            cv_grouping_strategy=cv_grouping_strategy,
+        )
+
+    def cv_secondary_metric(self, cv_results: list[CrossValidationResult]) -> float | None:
+        """Average ``f1_behavior`` over the CV folds; ``None`` when there are no folds."""
+        if cv_results:
+            return float(np.mean([fold.f1_behavior for fold in cv_results]))
+        return None
+
+
+class MultiClassTrainingStrategy(TrainingStrategy):
+    """Strategy hooks for training one classifier over every labeled behavior."""
+
+    def __init__(
+        self,
+        classifier: MultiClassClassifier,
+        project: Project,
+        behavior: str,
+    ) -> None:
+        super().__init__(classifier, project, behavior)
+        # Capture the settings object once, up front: feature extraction, the
+        # final ``train`` call and the report then all read this same reference
+        # even if the classifier later rebinds its ``project_settings``.
+        self._settings = classifier.project_settings or project.get_project_defaults()
+
+    def collect_features(
+        self,
+        progress_callable: Callable[[], None],
+        should_terminate_callable: Callable[[], None],
+    ) -> tuple[dict, dict]:
+        """Gather the shared labeled-feature set using the captured settings."""
+        fetch = self._project.get_multiclass_labeled_features
+        return fetch(
+            progress_callable=progress_callable,
+            should_terminate_callable=should_terminate_callable,
+            behavior_settings=self._settings,
+        )
+
+    def effective_settings(self) -> dict:
+        """Return the settings object captured when the strategy was constructed."""
+        return self._settings
+
+    def final_train_data(
+        self,
+        features: dict,
+        full_dataset: pd.DataFrame,
+        feature_names: list[str],
+    ) -> dict:
+        """Package features, per-behavior labels and settings for the final ``train`` call."""
+        feature_keys = ("per_frame", "window", "labels_by_behavior")
+        payload = {key: features[key] for key in feature_keys}
+        payload.update(settings=self._settings, feature_names=feature_names)
+        return payload
+
+    def save_classifier(self) -> None:
+        """Save the classifier via the project without a behavior-specific target."""
+        self._project.save_classifier(self._classifier)
+
+    def build_report_data(
+        self,
+        features: dict,
+        cv_results: list[CrossValidationResult],
+        final_top_features: list[tuple[str, float]],
+        elapsed_ms: int,
+        timestamp: datetime,
+        cv_grouping_strategy: CrossValidationGroupingStrategy,
+        distance_unit: str,
+        settings: dict,
+    ) -> TrainingReportData:
+        """Assemble the multi-class report with frame and bout counts for each class."""
+        class_names = self._classifier.get_class_names()
+        behavior_names = self._classifier.behavior_names
+
+        # Frame counts: tally merged-label entries equal to each class's index.
+        merged, _ = classifier_utils.merge_labels(features["labels_by_behavior"], behavior_names)
+        frames_per_class: dict[str, int] = {}
+        for class_idx, name in enumerate(class_names):
+            frames_per_class[name] = int(np.sum(merged == class_idx))
+
+        # Bout counts: add up the first unfragmented bout count of every video/identity.
+        bouts_per_class = {
+            name: sum(
+                per_identity["unfragmented_bout_counts"][0]
+                for per_video in self._project.counts(name).values()
+                for per_identity in per_video.values()
+            )
+            for name in class_names
+        }
+
+        return TrainingReportData(
+            behavior_name=self._behavior,
+            classifier_type=self._classifier.classifier_name,
+            balance_training_labels=settings.get("balance_labels", False),
+            symmetric_behavior=settings.get("symmetric_behavior", False),
+            distance_unit=distance_unit,
+            cv_results=cv_results,
+            final_top_features=final_top_features,
+            training_time_ms=elapsed_ms,
+            timestamp=timestamp,
+            window_size=settings.get("window_size", 0),
+            cv_grouping_strategy=cv_grouping_strategy,
+            class_frame_counts=frames_per_class,
+            class_bout_counts=bouts_per_class,
+        )
+
+    def cv_secondary_metric(self, cv_results: list[CrossValidationResult]) -> float | None:
+        """Average ``f1_macro`` over folds that report one; ``None`` if no fold does."""
+        macros = (getattr(fold, "f1_macro", None) for fold in cv_results or ())
+        scores = [value for value in macros if value is not None]
+        if scores:
+            return float(np.mean(scores))
+        return None
diff --git a/src/jabs/ui/training_thread.py b/src/jabs/ui/training_thread.py
index 6a634008..7f40e32a 100644
--- a/src/jabs/ui/training_thread.py
+++ b/src/jabs/ui/training_thread.py
@@ -8,16 +8,20 @@
 from jabs.classifier import (
     Classifier,
     MultiClassClassifier,
-    TrainingReportData,
     generate_markdown_report,
     save_training_report,
 )
 from jabs.classifier.cross_validation import run_leave_one_group_out_cv
-from jabs.core.constants import FINAL_TRAIN_SEED, MULTICLASS_NONE_BEHAVIOR
+from jabs.core.constants import FINAL_TRAIN_SEED
 from jabs.core.enums import ClassifierMode, ProjectDistanceUnit
 from jabs.project import Project
 
 from .exceptions import ThreadTerminatedError
+from .training_strategy import (
+    BinaryTrainingStrategy,
+    MultiClassTrainingStrategy,
+    TrainingStrategy,
+)
 
 
 class TrainingThread(QThread):
@@ -81,12 +85,27 @@ def request_termination(self) -> None:
         """
         self._should_terminate = True
 
+    def _build_strategy(self) -> TrainingStrategy:
+        """Pick the training strategy matching the project's classifier mode."""
+        mode = self._project.settings_manager.classifier_mode
+        if mode != ClassifierMode.MULTICLASS:
+            # Binary is the fallback for every mode other than multi-class.
+            return BinaryTrainingStrategy(
+                classifier=self._classifier,
+                project=self._project,
+                behavior=self._behavior,
+                bout_counts=self._bout_counts,
+            )
+        return MultiClassTrainingStrategy(
+            classifier=self._classifier, project=self._project, behavior=self._behavior
+        )
+
     def run(self) -> None:
-        """thread's main function
+        """Thread's main function.
 
-        Will get the feature set for all labeled frames, do the leave one group out train/test split,
-        run the training, run the trained classifier on the test data, collect performance metrics,
-        and generate a training report (saved as markdown and emitted as HTML).
+        Get the feature set for all labeled frames, run leave-one-group-out
+        cross-validation, train the final classifier on all data, save the
+        classifier and training report, and emit progress/completion signals.
         """
         t0_ns = time.perf_counter_ns()
         tasks_complete = 0
@@ -102,225 +121,80 @@ def id_processed() -> None:
             check_termination_requested()
 
         try:
-            self.current_status.emit("Extracting Features")
-            if self._project.settings_manager.classifier_mode == ClassifierMode.MULTICLASS:
-                # Resolve effective settings before feature extraction so that window
-                # features are always built with the same settings the classifier trains on.
-                multiclass_settings = (
-                    self._classifier.project_settings or self._project.get_project_defaults()
-                )
-
-                # Multi-class training uses one shared labeled-feature set across all behaviors.
-                features, group_mapping = self._project.get_multiclass_labeled_features(
-                    progress_callable=id_processed,
-                    should_terminate_callable=check_termination_requested,
-                    behavior_settings=multiclass_settings,
-                )
-                check_termination_requested()
-
-                cv_results = run_leave_one_group_out_cv(
-                    classifier=self._classifier,
-                    project=self._project,
-                    features=features,
-                    group_mapping=group_mapping,
-                    behavior=self._behavior,
-                    k=self._k,
-                    status_callback=self.current_status.emit,
-                    progress_callback=id_processed,
-                    terminate_callback=check_termination_requested,
-                )
+            strategy = self._build_strategy()
+            settings = strategy.effective_settings()
 
-                self.current_status.emit("Training Classifier")
-                full_dataset = self._classifier.combine_data(
-                    features["per_frame"], features["window"]
-                )
-                feature_names = full_dataset.columns.to_list()
-                self._classifier.train(
-                    {
-                        "per_frame": features["per_frame"],
-                        "window": features["window"],
-                        "labels_by_behavior": features["labels_by_behavior"],
-                        "settings": multiclass_settings,
-                        "feature_names": feature_names,
-                    },
-                    random_seed=FINAL_TRAIN_SEED,
-                )
-                final_top_features = self._classifier.get_feature_importance(limit=20)
-                self._project.save_classifier(self._classifier)
-
-                elapsed_ms = int((time.perf_counter_ns() - t0_ns) // 1_000_000)
-                behavior_names = list(getattr(self._classifier, "behavior_names", []))
-                if not behavior_names:
-                    behavior_names = [
-                        name
-                        for name in features["labels_by_behavior"]
-                        if name != MULTICLASS_NONE_BEHAVIOR
-                    ]
-
-                merged_labels, _ = MultiClassClassifier.merge_labels(
-                    features["labels_by_behavior"],
-                    behavior_names,
-                )
-                class_names = self._classifier.get_class_names()
-                class_frame_counts = {
-                    name: int(np.sum(merged_labels == class_idx))
-                    for class_idx, name in enumerate(class_names)
-                }
-                class_bout_counts: dict[str, int] = {}
-                for class_name in class_names:
-                    bouts = 0
-                    for video_counts in self._project.counts(class_name).values():
-                        for identity_counts in video_counts.values():
-                            bouts += identity_counts["unfragmented_bout_counts"][0]
-                    class_bout_counts[class_name] = bouts
+            self.current_status.emit("Extracting Features")
+            features, group_mapping = strategy.collect_features(
+                progress_callable=id_processed,
+                should_terminate_callable=check_termination_requested,
+            )
+            check_termination_requested()
 
-                unit = (
-                    "cm"
-                    if self._project.feature_manager.distance_unit == ProjectDistanceUnit.CM
-                    else "pixel"
-                )
-                report_timestamp = datetime.now()
-                training_data = TrainingReportData(
-                    behavior_name=self._behavior,
-                    classifier_type=self._classifier.classifier_name,
-                    balance_training_labels=multiclass_settings.get("balance_labels", False),
-                    symmetric_behavior=multiclass_settings.get("symmetric_behavior", False),
-                    distance_unit=unit,
-                    cv_results=cv_results,
-                    final_top_features=final_top_features,
-                    training_time_ms=elapsed_ms,
-                    timestamp=report_timestamp,
-                    window_size=multiclass_settings.get("window_size", 0),
-                    cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
-                    class_frame_counts=class_frame_counts,
-                    class_bout_counts=class_bout_counts,
+            cv_results = run_leave_one_group_out_cv(
+                classifier=self._classifier,
+                project=self._project,
+                features=features,
+                group_mapping=group_mapping,
+                behavior=self._behavior,
+                k=self._k,
+                status_callback=self.current_status.emit,
+                progress_callback=id_processed,
+                terminate_callback=check_termination_requested,
+            )
+
+            self.current_status.emit("Training Classifier")
+            full_dataset = self._classifier.combine_data(features["per_frame"], features["window"])
+            feature_names = full_dataset.columns.to_list()
+            self._classifier.train(
+                strategy.final_train_data(features, full_dataset, feature_names),
+                random_seed=FINAL_TRAIN_SEED,
+            )
+            final_top_features = self._classifier.get_feature_importance(limit=20)
+            strategy.save_classifier()
+
+            elapsed_ms = int((time.perf_counter_ns() - t0_ns) // 1_000_000)
+            unit = (
+                "cm"
+                if self._project.feature_manager.distance_unit == ProjectDistanceUnit.CM
+                else "pixel"
+            )
+            training_data = strategy.build_report_data(
+                features=features,
+                cv_results=cv_results,
+                final_top_features=final_top_features,
+                elapsed_ms=elapsed_ms,
+                timestamp=datetime.now(),
+                cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
+                distance_unit=unit,
+                settings=settings,
+            )
+
+            timestamp_str = training_data.timestamp.strftime("%Y%m%d_%H%M%S")
+            report_filename = f"{self._behavior}_{timestamp_str}_training_report.md"
+            report_path = self._training_log_dir / report_filename
+            save_training_report(training_data, report_path)
+
+            markdown_content = generate_markdown_report(training_data)
+            self.training_report.emit(markdown_content)
+
+            if self._k > 0 and training_data.cv_results:
+                accuracies = [cv.accuracy for cv in training_data.cv_results]
+                self._project.session_tracker.classifier_trained(
+                    self._behavior,
+                    self._classifier.classifier_name,
+                    len(training_data.cv_results),
+                    float(np.mean(accuracies)),
+                    strategy.cv_secondary_metric(training_data.cv_results),
                 )
-
-                timestamp_str = training_data.timestamp.strftime("%Y%m%d_%H%M%S")
-                report_filename = f"{self._behavior}_{timestamp_str}_training_report.md"
-                report_path = self._training_log_dir / report_filename
-                save_training_report(training_data, report_path)
-
-                markdown_content = generate_markdown_report(training_data)
-                self.training_report.emit(markdown_content)
-
-                if self._k > 0 and training_data.cv_results:
-                    accuracies = [cv.accuracy for cv in training_data.cv_results]
-                    f1_macro = [
-                        cv.f1_macro for cv in training_data.cv_results if cv.f1_macro is not None
-                    ]
-                    mean_f1_macro = float(np.mean(f1_macro)) if f1_macro else None
-                    self._project.session_tracker.classifier_trained(
-                        self._behavior,
-                        self._classifier.classifier_name,
-                        len(training_data.cv_results),
-                        float(np.mean(accuracies)),
-                        mean_f1_macro,
-                    )
-                else:
-                    self._project.session_tracker.classifier_trained(
-                        self._behavior,
-                        self._classifier.classifier_name,
-                        0,
-                    )
-                self.update_progress.emit(tasks_complete + 1)
-                self.training_complete.emit(elapsed_ms)
             else:
-                features, group_mapping = self._project.get_labeled_features(
+                self._project.session_tracker.classifier_trained(
                     self._behavior,
-                    progress_callable=id_processed,
-                    should_terminate_callable=check_termination_requested,
-                )
-                check_termination_requested()
-
-                # Binary mode keeps the existing LOGO CV + markdown report path.
-                cv_results = run_leave_one_group_out_cv(
-                    classifier=self._classifier,
-                    project=self._project,
-                    features=features,
-                    group_mapping=group_mapping,
-                    behavior=self._behavior,
-                    k=self._k,
-                    status_callback=self.current_status.emit,
-                    progress_callback=id_processed,
-                    terminate_callback=check_termination_requested,
-                )
-
-                # Final training on all data
-                full_dataset = self._classifier.combine_data(
-                    features["per_frame"], features["window"]
+                    self._classifier.classifier_name,
+                    0,
                 )
-                feature_names = full_dataset.columns.to_list()
-                self._classifier.train(
-                    {
-                        "training_data": full_dataset,
-                        "training_labels": features["labels"],
-                        "feature_names": feature_names,
-                    },
-                    random_seed=FINAL_TRAIN_SEED,
-                )
-                final_top_features = self._classifier.get_feature_importance(limit=20)
-                self._project.save_classifier(self._classifier, self._behavior)
-
-                # Prepare training report
-                elapsed_ms = int((time.perf_counter_ns() - t0_ns) // 1_000_000)
-                behavior_count = int(np.sum(features["labels"] == 1))
-                not_behavior_count = int(np.sum(features["labels"] == 0))
-                behavior_bouts, not_behavior_bouts = self._bout_counts
-                unit = (
-                    "cm"
-                    if self._project.feature_manager.distance_unit == ProjectDistanceUnit.CM
-                    else "pixel"
-                )
-                report_timestamp = datetime.now()
-                behavior_settings = self._project.settings_manager.get_behavior(self._behavior)
-                training_data = TrainingReportData(
-                    behavior_name=self._behavior,
-                    classifier_type=self._classifier.classifier_name,
-                    balance_training_labels=behavior_settings.get("balance_labels", False),
-                    symmetric_behavior=behavior_settings.get("symmetric_behavior", False),
-                    distance_unit=unit,
-                    cv_results=cv_results,
-                    final_top_features=final_top_features,
-                    frames_behavior=behavior_count,
-                    frames_not_behavior=not_behavior_count,
-                    bouts_behavior=behavior_bouts,
-                    bouts_not_behavior=not_behavior_bouts,
-                    training_time_ms=elapsed_ms,
-                    timestamp=report_timestamp,
-                    window_size=behavior_settings["window_size"],
-                    cv_grouping_strategy=self._project.settings_manager.cv_grouping_strategy,
-                )
-
-                # Save markdown report
-                timestamp_str = training_data.timestamp.strftime("%Y%m%d_%H%M%S")
-                report_filename = f"{self._behavior}_{timestamp_str}_training_report.md"
-                report_path = self._training_log_dir / report_filename
-                save_training_report(training_data, report_path)
-
-                # Generate and emit markdown report
-                markdown_content = generate_markdown_report(training_data)
-                self.training_report.emit(markdown_content)
-
-                # Update session tracker
-                if self._k > 0 and training_data.cv_results:
-                    accuracies = [cv.accuracy for cv in training_data.cv_results]
-                    fbeta_behavior = [cv.f1_behavior for cv in training_data.cv_results]
-                    self._project.session_tracker.classifier_trained(
-                        self._behavior,
-                        self._classifier.classifier_name,
-                        len(training_data.cv_results),
-                        float(np.mean(accuracies)),
-                        float(np.mean(fbeta_behavior)),
-                    )
-                else:
-                    self._project.session_tracker.classifier_trained(
-                        self._behavior,
-                        self._classifier.classifier_name,
-                        0,
-                    )
 
-                self.update_progress.emit(tasks_complete + 1)
-                self.training_complete.emit(training_data.training_time_ms)
+            self.update_progress.emit(tasks_complete + 1)
+            self.training_complete.emit(elapsed_ms)
         except Exception as e:
             self.error_callback.emit(e)
diff --git a/tests/classifier/test_classifier.py b/tests/classifier/test_classifier.py
index 82e16a20..524ac4e8 100644
--- a/tests/classifier/test_classifier.py
+++ b/tests/classifier/test_classifier.py
@@ -486,11 +486,7 @@ class TestClassifierSaveLoad:
     """Test classifier save and load functionality."""
 
     def test_save_and_load(self, sample_features, sample_labels, mock_project, tmp_path):
-        """Test saving and loading a trained classifier.
-
-        Note: Currently, feature_names is not preserved during save/load.
-        If this is changed, we should check that after loading, feature_names match.
-        """
+        """Test saving and loading a trained classifier."""
         # Train a classifier
         clf = Classifier()
         clf.behavior_name = "Grooming"
@@ -517,6 +513,11 @@ def test_save_and_load(self, sample_features, sample_labels, mock_project, tmp_p
 
         assert clf2.behavior_name == "Grooming"
         assert clf2.classifier_type == clf.classifier_type
+        assert clf2.feature_names == clf.feature_names
+        # feature_names round-tripping is what makes get_feature_importance work
+        # after a load - guard against the regression by checking the report is
+        # non-empty on the loaded instance.
+        assert clf2.get_feature_importance(limit=5)
 
         # Predictions should still work and match
         pred1 = clf.predict(sample_features)
@@ -532,7 +533,7 @@ def test_load_invalid_file_raises_error(self, tmp_path):
         joblib.dump({"not": "a classifier"}, invalid_file)
 
         clf = Classifier()
-        with pytest.raises(ValueError, match="not instance of Classifier"):
+        with pytest.raises(ValueError, match="not an instance of Classifier"):
             clf.load(invalid_file)
 
 
diff --git a/tests/classifier/test_multi_class_classifier.py b/tests/classifier/test_multi_class_classifier.py
index 24387d68..48ad0151 100644
--- a/tests/classifier/test_multi_class_classifier.py
+++ b/tests/classifier/test_multi_class_classifier.py
@@ -4,9 +4,10 @@
 import pandas as pd
 import pytest
 
+from jabs.classifier import classifier_utils
 from jabs.classifier.multi_class_classifier import MultiClassClassifier
 from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-from jabs.core.enums import ClassifierType
+from jabs.core.enums import ClassifierType, CrossValidationGroupingStrategy
 from jabs.project import TrackLabels
 
 try:
@@ -98,11 +99,11 @@ def combined_features(synthetic_features) -> pd.DataFrame:
 
 
 class TestMergeLabels:
-    """Tests for MultiClassClassifier.merge_labels."""
+    """Tests for classifier_utils.merge_labels (multi-class label merging)."""
 
     def test_behavior_frames_map_to_correct_class_index(self, two_behavior_labels):
         """Frames labeled BEHAVIOR for each behavior map to class 1 and 2."""
-        labels, mask = MultiClassClassifier.merge_labels(two_behavior_labels, BEHAVIOR_NAMES)
+        labels, mask = classifier_utils.merge_labels(two_behavior_labels, BEHAVIOR_NAMES)
 
         # 9 frames included: 3 running (class 1) + 3 grooming (class 2) + 3 none (class 0)
         assert mask.sum() == 9
@@ -112,7 +113,7 @@ def test_behavior_frames_map_to_correct_class_index(self, two_behavior_labels):
 
     def test_none_label_frames_excluded(self, two_behavior_labels):
         """Frames with NONE label on all behaviors are excluded from training."""
-        _, mask = MultiClassClassifier.merge_labels(two_behavior_labels, BEHAVIOR_NAMES)
+        _, mask = classifier_utils.merge_labels(two_behavior_labels, BEHAVIOR_NAMES)
 
         # frames 9-11 are unlabeled → excluded
         assert not mask[9]
@@ -124,7 +125,7 @@ def test_not_behavior_frames_excluded(self):
         labels_by_behavior = {
             "walking": np.array([_B, _B, _N, _N, _X, _X], dtype=np.int8),
         }
-        _, mask = MultiClassClassifier.merge_labels(labels_by_behavior, ["walking"])
+        _, mask = classifier_utils.merge_labels(labels_by_behavior, ["walking"])
 
         # only the first two BEHAVIOR frames are included
         assert mask.sum() == 2
@@ -137,7 +138,7 @@ def test_none_behavior_maps_to_class_zero(self):
             "walking": np.array([_B, _B, _X, _X], dtype=np.int8),
             MULTICLASS_NONE_BEHAVIOR: np.array([_X, _X, _B, _B], dtype=np.int8),
         }
-        labels, mask = MultiClassClassifier.merge_labels(labels_by_behavior, ["walking"])
+        labels, mask = classifier_utils.merge_labels(labels_by_behavior, ["walking"])
 
         assert mask.sum() == 4
         np.testing.assert_array_equal(labels, [1, 1, 0, 0])
@@ -150,7 +151,7 @@ def test_behavior_ordering_respected(self):
             "b": np.array([_X, _X, _X, _X, _B, _X], dtype=np.int8),
         }
         behavior_names = ["a", "b", "c"]  # alphabetical, class 1/2/3
-        labels, _ = MultiClassClassifier.merge_labels(labels_by_behavior, behavior_names)
+        labels, _ = classifier_utils.merge_labels(labels_by_behavior, behavior_names)
 
         # "a" → class 1, "b" → class 2, "c" → class 3
         assert 3 in labels  # "c" at frame 0
@@ -160,7 +161,7 @@ def test_behavior_ordering_respected(self):
     def test_empty_labels_by_behavior_raises(self):
         """Empty labels_by_behavior raises ValueError."""
         with pytest.raises(ValueError, match="must not be empty"):
-            MultiClassClassifier.merge_labels({}, ["running"])
+            classifier_utils.merge_labels({}, ["running"])
 
     def test_conflicting_behavior_labels_raises(self):
         """Frames with BEHAVIOR in more than one behavior raise ValueError."""
@@ -169,7 +170,7 @@ def test_conflicting_behavior_labels_raises(self):
             "grooming": np.array([_B, _B, _X, _X], dtype=np.int8),
         }
         with pytest.raises(ValueError, match="Conflicting BEHAVIOR labels"):
-            MultiClassClassifier.merge_labels(labels_by_behavior, ["running", "grooming"])
+            classifier_utils.merge_labels(labels_by_behavior, ["running", "grooming"])
 
     def test_missing_behavior_in_dict_is_skipped(self):
         """Behavior names not present in labels_by_behavior are silently skipped."""
@@ -177,9 +178,7 @@ def test_missing_behavior_in_dict_is_skipped(self):
             "running": np.array([_B, _B, _X, _X], dtype=np.int8),
             # "grooming" intentionally absent
         }
-        labels, mask = MultiClassClassifier.merge_labels(
-            labels_by_behavior, ["running", "grooming"]
-        )
+        labels, mask = classifier_utils.merge_labels(labels_by_behavior, ["running", "grooming"])
         assert mask.sum() == 2
         np.testing.assert_array_equal(labels, [1, 1])
 
@@ -260,13 +259,6 @@ def test_classifier_metadata_properties_defaults(self):
         assert clf.classifier_file is None
         assert clf.classifier_hash is None
         assert clf.project_settings == {}
-        assert clf.behavior_name is None
-
-    def test_behavior_name_property(self):
-        """behavior_name getter/setter round-trips values."""
-        clf = MultiClassClassifier(BEHAVIOR_NAMES)
-        clf.behavior_name = "Running"
-        assert clf.behavior_name == "Running"
 
     def test_set_dict_settings_copies(self):
         """set_dict_settings stores and returns a defensive copy."""
@@ -597,6 +589,131 @@ def test_get_leave_one_group_out_max_zero_when_training_incomplete(self):
         assert MultiClassClassifier.get_leave_one_group_out_max(labels, groups) == 0
 
 
+# ---------------------------------------------------------------------------
+# count_label_threshold / label_threshold_met
+# ---------------------------------------------------------------------------
+
+
+class TestLabelThreshold:
+    """Tests for MultiClassClassifier.count_label_threshold and label_threshold_met.
+
+    Test counts use 20 frames, matching ``classifier_utils.LABEL_THRESHOLD``.
+    """
+
+    def test_count_label_threshold_individual(self) -> None:
+        """Valid-split counting matches multiclass LOGO constraints for per-identity groups."""
+        counts_by_behavior = {
+            "None": {
+                "video_a.avi": {
+                    0: {"fragmented_frame_counts": (20, 0)},
+                    1: {"fragmented_frame_counts": (20, 0)},
+                    2: {"fragmented_frame_counts": (20, 0)},
+                }
+            },
+            "Walk": {
+                "video_a.avi": {
+                    0: {"fragmented_frame_counts": (20, 0)},
+                    1: {"fragmented_frame_counts": (20, 0)},
+                    2: {"fragmented_frame_counts": (20, 0)},
+                }
+            },
+            "Run": {
+                "video_a.avi": {
+                    0: {"fragmented_frame_counts": (0, 0)},
+                    1: {"fragmented_frame_counts": (20, 0)},
+                    2: {"fragmented_frame_counts": (20, 0)},
+                }
+            },
+        }
+
+        valid = MultiClassClassifier.count_label_threshold(
+            counts_by_behavior=counts_by_behavior,
+            behavior_names=["None", "Walk", "Run"],
+            cv_grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
+        )
+
+        assert valid == 3
+
+    def test_count_label_threshold_video_grouping(self) -> None:
+        """Video grouping aggregates identities per video before validity checks."""
+        counts_by_behavior = {
+            "None": {
+                "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
+                "video_b.avi": {0: {"fragmented_frame_counts": (20, 0)}},
+            },
+            "Walk": {
+                "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
+                "video_b.avi": {0: {"fragmented_frame_counts": (20, 0)}},
+            },
+            "Run": {
+                "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
+                "video_b.avi": {0: {"fragmented_frame_counts": (0, 0)}},
+            },
+        }
+
+        valid = MultiClassClassifier.count_label_threshold(
+            counts_by_behavior=counts_by_behavior,
+            behavior_names=["None", "Walk", "Run"],
+            cv_grouping_strategy=CrossValidationGroupingStrategy.VIDEO,
+        )
+
+        assert valid == 1
+
+    def test_count_label_threshold_empty_behavior_names_returns_zero(self) -> None:
+        """No behaviors → no valid splits."""
+        assert (
+            MultiClassClassifier.count_label_threshold(
+                counts_by_behavior={},
+                behavior_names=[],
+            )
+            == 0
+        )
+
+    def test_label_threshold_met_requires_at_least_two_behaviors(self) -> None:
+        """Returns False when fewer than two class names are supplied."""
+        assert not MultiClassClassifier.label_threshold_met(
+            counts_by_behavior={"None": {}},
+            behavior_names=["None"],
+            min_groups=1,
+        )
+
+    def test_label_threshold_met_true_when_min_groups_satisfied(self) -> None:
+        """Returns True when valid-split count meets ``min_groups``."""
+        counts_by_behavior = {
+            "None": {
+                "video_a.avi": {
+                    0: {"fragmented_frame_counts": (20, 0)},
+                    1: {"fragmented_frame_counts": (20, 0)},
+                }
+            },
+            "Walk": {
+                "video_a.avi": {
+                    0: {"fragmented_frame_counts": (20, 0)},
+                    1: {"fragmented_frame_counts": (20, 0)},
+                }
+            },
+        }
+        assert MultiClassClassifier.label_threshold_met(
+            counts_by_behavior=counts_by_behavior,
+            behavior_names=["None", "Walk"],
+            min_groups=2,
+            cv_grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
+        )
+
+    def test_label_threshold_met_false_when_below_min_groups(self) -> None:
+        """Returns False when ``min_groups`` exceeds the valid-split count."""
+        counts_by_behavior = {
+            "None": {"video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}}},
+            "Walk": {"video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}}},
+        }
+        assert not MultiClassClassifier.label_threshold_met(
+            counts_by_behavior=counts_by_behavior,
+            behavior_names=["None", "Walk"],
+            min_groups=5,
+            cv_grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
+        )
+
+
 # ---------------------------------------------------------------------------
 # Protocol compliance
 # ---------------------------------------------------------------------------
diff --git a/tests/classifier/test_training_report.py b/tests/classifier/test_training_report.py
index f5440e75..85dfe050 100644
--- a/tests/classifier/test_training_report.py
+++ b/tests/classifier/test_training_report.py
@@ -6,7 +6,8 @@
 import pytest
 
 from jabs.classifier.training_report import (
-    CrossValidationResult,
+    BinaryCVResult,
+    MultiClassCVResult,
     TrainingReportData,
     generate_json_report,
     generate_markdown_report,
@@ -23,7 +24,7 @@ def sample_cv_results():
         List of CrossValidationResult objects.
     """
     return [
-        CrossValidationResult(
+        BinaryCVResult(
             iteration=1,
             test_label="video_1.mp4 [0]",
             accuracy=0.9234,
@@ -37,7 +38,7 @@ def sample_cv_results():
             confusion_matrix=np.array([[180, 20], [15, 135]]),
             top_features=[("nose_speed", 0.16), ("ear_angle", 0.14)],
         ),
-        CrossValidationResult(
+        BinaryCVResult(
             iteration=2,
             test_label="video_2.mp4 [1]",
             accuracy=0.8912,
@@ -87,11 +88,11 @@ def sample_training_data(sample_cv_results):
 
 
 class TestCrossValidationResult:
-    """Tests for CrossValidationResult dataclass."""
+    """Tests for BinaryCVResult dataclass."""
 
     def test_create_cv_result(self):
-        """Test creating a CrossValidationResult instance."""
-        result = CrossValidationResult(
+        """Test creating a BinaryCVResult instance."""
+        result = BinaryCVResult(
             iteration=1,
             test_label="test.mp4 [0]",
             accuracy=0.95,
@@ -360,7 +361,7 @@ class TestMulticlassReport:
     def test_multiclass_markdown_contains_multiclass_metrics(self):
         """Markdown report uses multiclass summary/table and class counts."""
         cv_results = [
-            CrossValidationResult(
+            MultiClassCVResult(
                 iteration=1,
                 test_label="video_a.mp4 [0]",
                 accuracy=0.82,
@@ -402,7 +403,7 @@ def test_multiclass_markdown_contains_multiclass_metrics(self):
     def test_multiclass_json_contains_optional_metrics(self):
         """JSON report serializes multiclass-only CV and class-count fields."""
         cv_results = [
-            CrossValidationResult(
+            MultiClassCVResult(
                 iteration=1,
                 test_label="video_a.mp4",
                 accuracy=0.9,
diff --git a/tests/project/test_parallel_workers.py b/tests/project/test_parallel_workers.py
index 438f85bd..41ab6a6b 100644
--- a/tests/project/test_parallel_workers.py
+++ b/tests/project/test_parallel_workers.py
@@ -8,12 +8,12 @@
 import pytest
 
 from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-from jabs.core.enums import CacheFormat, ClassifierMode
+from jabs.core.enums import CacheFormat
 from jabs.project.parallel_workers import (
     VideoScanJobSpec,
     VideoScanResult,
     _get_identity_count,
-    collect_labeled_features,
+    collect_multiclass_labeled_features,
     scan_video_metadata,
 )
 from jabs.project.track_labels import TrackLabels
@@ -68,7 +68,7 @@ def merge_window_features(window_features: dict[str, np.ndarray]) -> dict[str, n
     monkeypatch.setattr("jabs.project.parallel_workers._load_video_labels", lambda *_: labels)
     monkeypatch.setattr("jabs.project.parallel_workers.fe.IdentityFeatures", _FakeIdentityFeatures)
 
-    result = collect_labeled_features(
+    result = collect_multiclass_labeled_features(
         {
             "video": "video.avi",
             "video_path": tmp_path / "video.avi",
@@ -77,9 +77,7 @@ def merge_window_features(window_features: dict[str, np.ndarray]) -> dict[str, n
             "feature_dir": tmp_path / "features",
             "cache_dir": tmp_path / "cache",
             "behavior_settings": {"window_size": 3},
-            "behavior_name": None,
             "behavior_names": ["Walk", "Run"],
-            "classifier_mode": ClassifierMode.MULTICLASS.value,
             "cache_format": CacheFormat.HDF5.value,
         }
     )
@@ -140,7 +138,7 @@ def test_collect_labeled_features_multiclass_requires_behavior_names(
     monkeypatch.setattr("jabs.project.parallel_workers._load_video_labels", lambda *_: labels)
 
     with pytest.raises(ValueError, match="behavior_names is required"):
-        collect_labeled_features(
+        collect_multiclass_labeled_features(
             {
                 "video": "video.avi",
                 "video_path": tmp_path / "video.avi",
@@ -149,9 +147,7 @@ def test_collect_labeled_features_multiclass_requires_behavior_names(
                 "feature_dir": tmp_path / "features",
                 "cache_dir": tmp_path / "cache",
                 "behavior_settings": {"window_size": 3},
-                "behavior_name": None,
                 "behavior_names": None,
-                "classifier_mode": ClassifierMode.MULTICLASS.value,
                 "cache_format": CacheFormat.HDF5.value,
             }
         )
diff --git a/tests/project/test_project.py b/tests/project/test_project.py
index 83f7d1dd..1f87002d 100644
--- a/tests/project/test_project.py
+++ b/tests/project/test_project.py
@@ -492,7 +492,7 @@ def test_overlapping_labels_conflict_detected(tmp_path: Path) -> None:
 def test_overlapping_labels_none_behavior_is_a_conflict(tmp_path: Path) -> None:
     """A behavior and the None behavior sharing a BEHAVIOR-labeled frame is a conflict.
 
-    This keeps the validator consistent with MultiClassClassifier.merge_labels(),
+    This keeps the validator consistent with classifier_utils.merge_labels(),
     which raises ValueError for the same condition at training time.
     """
     labels = VideoLabels("video1.avi", 100)
@@ -581,7 +581,7 @@ def _fake_collect(job: dict) -> dict:
             "group_keys": [("video_b.avi", 1)],
         }
 
-    monkeypatch.setattr("jabs.project.project.collect_labeled_features", _fake_collect)
+    monkeypatch.setattr("jabs.project.project.collect_multiclass_labeled_features", _fake_collect)
 
     progress_calls = {"count": 0}
     features, group_mapping = project.get_multiclass_labeled_features(
@@ -589,7 +589,6 @@ def _fake_collect(job: dict) -> dict:
     )
 
     assert progress_calls["count"] == 2
-    assert all(job["classifier_mode"] == ClassifierMode.MULTICLASS.value for job in jobs_seen)
     assert all(job["behavior_names"] == ["Walk", "Run"] for job in jobs_seen)
 
     assert features["per_frame"].shape[0] == 3
@@ -651,7 +650,7 @@ def _fake_collect(_job: dict) -> dict:
             "group_keys": [("video_a.avi", 0)],
         }
 
-    monkeypatch.setattr("jabs.project.project.collect_labeled_features", _fake_collect)
+    monkeypatch.setattr("jabs.project.project.collect_multiclass_labeled_features", _fake_collect)
 
     features, _ = project.get_multiclass_labeled_features()
 
diff --git a/tests/ui/__init__.py b/tests/ui/__init__.py
new file mode 100644
index 00000000..a330bead
--- /dev/null
+++ b/tests/ui/__init__.py
@@ -0,0 +1 @@
+"""UI test package; declared so test modules can share helpers via relative imports."""
diff --git a/tests/ui/_fakes.py b/tests/ui/_fakes.py
new file mode 100644
index 00000000..1c8bcb78
--- /dev/null
+++ b/tests/ui/_fakes.py
@@ -0,0 +1,275 @@
+"""Shared test doubles for UI thread tests (TrainingThread, ClassifyThread).
+
+The classifier and project test doubles diverge by which thread they support
+(training vs. classification), so they remain separate classes. Co-locating
+them here keeps test files focused on their assertions and provides a single
+spot to extend the fakes as more UI thread tests are added.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from typing import TYPE_CHECKING, ClassVar
+from unittest.mock import MagicMock
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
+from jabs.core.enums import (
+    ClassifierMode,
+    CrossValidationGroupingStrategy,
+    ProjectDistanceUnit,
+)
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+# ---------------------------------------------------------------------------
+# Classifier fakes
+# ---------------------------------------------------------------------------
+
+
+class FakeTrainingClassifier:
+    """Minimal Classifier stand-in implementing the methods and attributes ``TrainingThread`` calls.
+
+    Records ``train(...)`` calls for assertion; ``get_feature_importance`` and
+    ``combine_data`` return deterministic placeholders.
+    """
+
+    def __init__(
+        self,
+        name: str = "random_forest",
+        project_settings: dict | None = None,
+    ) -> None:
+        self.classifier_name = name
+        self.project_settings = {} if project_settings is None else dict(project_settings)
+        self.behavior_names: list[str] = ["Walk", "Run"]
+        self.train_calls: list[dict] = []
+
+    @staticmethod
+    def combine_data(per_frame: pd.DataFrame, window: pd.DataFrame) -> pd.DataFrame:
+        """Concatenate per-frame and window features side-by-side."""
+        return pd.concat([per_frame, window], axis=1)
+
+    def train(self, data: dict, random_seed: int | None = None) -> None:
+        """Record the call payload and seed for later assertion."""
+        call = dict(data)
+        call["random_seed"] = random_seed
+        self.train_calls.append(call)
+
+    @staticmethod
+    def get_feature_importance(limit: int = 20) -> list[tuple[str, float]]:
+        """Return a placeholder feature-importance list capped at ``limit``."""
+        return [("feat_a", 1.0)][:limit]
+
+    def get_class_names(self) -> list[str]:
+        """Return ``[MULTICLASS_NONE_BEHAVIOR, *behavior_names]`` for multi-class reporting."""
+        return [MULTICLASS_NONE_BEHAVIOR, *self.behavior_names]
+
+
+class FakeClassifyingClassifier:
+    """Minimal Classifier stand-in implementing the methods and attributes ``ClassifyThread`` calls.
+
+    ``predict_proba`` returns a fixed probability matrix shaped by ``multiclass``
+    (binary: ``(n, 2)``; multi-class: ``(n, 3)``). ``derive_predictions`` argmax-es
+    that matrix and sets the prediction to ``-1`` wherever the confidence is 0.
+    """
+
+    def __init__(self, multiclass: bool = False) -> None:
+        self._multiclass = multiclass
+        self.project_settings: dict = {"window_size": 7}
+        self.behavior_names: list[str] = ["Walk", "Run"]
+
+    @staticmethod
+    def combine_data(per_frame: pd.DataFrame, window: pd.DataFrame) -> pd.DataFrame:
+        """Concatenate per-frame and window features side-by-side."""
+        return pd.concat([per_frame, window], axis=1)
+
+    def predict_proba(
+        self,
+        data: pd.DataFrame,
+        frame_indexes: npt.NDArray[np.intp],
+    ) -> npt.NDArray[np.float32]:
+        """Return fixed probabilities; multi-class zeroes rows with frame_indexes == -1."""
+        n = len(data)
+        if self._multiclass:
+            probs = np.zeros((n, 3), dtype=np.float32)
+            probs[:, 0] = 0.1
+            probs[:, 1] = 0.7
+            probs[:, 2] = 0.2
+            probs[frame_indexes == -1] = 0.0
+            return probs
+
+        probs = np.zeros((n, 2), dtype=np.float32)
+        probs[:, 0] = 0.2
+        probs[:, 1] = 0.8
+        return probs
+
+    def derive_predictions(
+        self,
+        probabilities: npt.NDArray[np.float32],
+    ) -> tuple[npt.NDArray[np.int8], npt.NDArray[np.float32]]:
+        """Argmax predictions and their confidence; prediction is -1 where confidence is 0."""
+        predictions = np.argmax(probabilities, axis=1).astype(np.int8)
+        confidence = probabilities[np.arange(len(probabilities)), predictions].astype(np.float32)
+        predictions[confidence == 0] = -1
+        return predictions, confidence
+
+    @staticmethod
+    def get_class_names() -> list[str]:
+        """Return the ordered class names for multi-class persistence."""
+        return [MULTICLASS_NONE_BEHAVIOR, "Walk", "Run"]
+
+
+# ---------------------------------------------------------------------------
+# Project fakes
+# ---------------------------------------------------------------------------
+
+
+class FakeTrainingProject:
+    """Minimal Project stand-in implementing the methods and attributes ``TrainingThread`` calls.
+
+    Counts calls to ``get_labeled_features`` / ``get_multiclass_labeled_features``
+    so tests can assert which mode path ran. ``save_classifier`` is a ``MagicMock``
+    so call args can be checked.
+    """
+
+    _DEFAULT_BEHAVIOR_SETTINGS: ClassVar[dict] = {
+        "window_size": 5,
+        "balance_labels": False,
+        "symmetric_behavior": False,
+    }
+
+    def __init__(
+        self,
+        tmp_path: Path,
+        mode: ClassifierMode,
+        binary_features: dict | None = None,
+        multiclass_features: dict | None = None,
+    ) -> None:
+        self.project_paths = SimpleNamespace(training_log_dir=tmp_path)
+        self.feature_manager = SimpleNamespace(distance_unit=ProjectDistanceUnit.PIXEL)
+        self.session_tracker = SimpleNamespace(classifier_trained=MagicMock())
+        self.settings_manager = SimpleNamespace(
+            classifier_mode=mode,
+            cv_grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
+            get_behavior=lambda _behavior: dict(self._DEFAULT_BEHAVIOR_SETTINGS),
+        )
+        self._binary_features = binary_features
+        self._multiclass_features = multiclass_features
+        self.binary_calls = 0
+        self.multiclass_calls = 0
+        self.save_classifier = MagicMock()
+
+    def get_project_defaults(self) -> dict:
+        """Return the default project settings used when classifier has none."""
+        return dict(self._DEFAULT_BEHAVIOR_SETTINGS)
+
+    def get_labeled_features(
+        self,
+        behavior: str,
+        progress_callable=None,
+        should_terminate_callable=None,
+    ) -> tuple[dict, dict]:
+        """Return the binary feature payload and a single-identity group mapping."""
+        self.binary_calls += 1
+        if should_terminate_callable is not None:
+            should_terminate_callable()
+        if progress_callable is not None:
+            progress_callable()
+        return self._binary_features, {0: {"video": "video.avi", "identity": 0}}
+
+    def get_multiclass_labeled_features(
+        self,
+        progress_callable=None,
+        should_terminate_callable=None,
+        grouping_strategy=None,
+        behavior_settings=None,
+    ) -> tuple[dict, dict]:
+        """Return the multi-class feature payload and an empty group mapping."""
+        self.multiclass_calls += 1
+        if should_terminate_callable is not None:
+            should_terminate_callable()
+        if progress_callable is not None:
+            progress_callable()
+        return self._multiclass_features, {}
+
+    @staticmethod
+    def counts(_behavior: str) -> dict:
+        """Return a single-video, single-identity bout-count entry."""
+        return {"video.avi": {0: {"unfragmented_bout_counts": (1, 0)}}}
+
+
+class FakeClassifyingProject:
+    """Minimal Project stand-in implementing the methods and attributes ``ClassifyThread`` calls.
+
+    ``save_predictions`` is a ``MagicMock`` so tests can assert how predictions
+    were persisted; ``load_pose_est`` returns a fixed five-frame pose stand-in.
+    """
+
+    def __init__(self, mode: ClassifierMode) -> None:
+        self.settings_manager = SimpleNamespace(
+            classifier_mode=mode,
+            get_behavior=lambda _behavior: {"window_size": 5, "postprocessing": []},
+        )
+        self.feature_manager = SimpleNamespace(distance_unit=ProjectDistanceUnit.PIXEL)
+        self.video_manager = SimpleNamespace(
+            videos=["video.avi"],
+            video_path=lambda _video: "video.avi",
+            num_videos=1,
+        )
+        self.feature_dir = "feature_dir"
+        self.cache_format = "hdf5"
+        self.save_predictions = MagicMock()
+        self._pose = SimpleNamespace(
+            identities=[0],
+            num_identities=1,
+            num_frames=5,
+            fps=30,
+        )
+
+    def load_pose_est(self, _video_path) -> SimpleNamespace:
+        """Return the fixed pose-estimation stand-in."""
+        return self._pose
+
+    @staticmethod
+    def get_project_defaults() -> dict:
+        """Return the default project settings used when classifier has none."""
+        return {"window_size": 9}
+
+
+# ---------------------------------------------------------------------------
+# IdentityFeatures factory
+# ---------------------------------------------------------------------------
+
+
+def make_fake_identity_features() -> type:
+    """Return a fresh ``IdentityFeatures`` stand-in class with isolated state.
+
+    Tests typically ``monkeypatch.setattr(..., make_fake_identity_features())``
+    onto the production import site. Each call returns a fresh class so the
+    ``op_settings_seen`` record does not leak between tests.
+
+    The returned class exposes the same surface ``ClassifyThread`` uses:
+    ``__init__(*args, **kwargs)`` records the ``op_settings`` kwarg, and
+    ``get_features(window_size)`` returns a fixed 5-frame feature payload.
+    """
+
+    class FakeIdentityFeatures:
+        op_settings_seen: list[dict] = []  # noqa: RUF012 - per-class fresh state
+
+        def __init__(self, *_args, **kwargs) -> None:
+            self.__class__.op_settings_seen.append(kwargs.get("op_settings", {}))
+
+        @staticmethod
+        def get_features(_window_size: int) -> dict:
+            return {
+                "per_frame": {"a": np.arange(5, dtype=np.float32)},
+                "window": {"b": np.arange(5, dtype=np.float32)},
+                "frame_indexes": np.arange(5, dtype=np.intp),
+            }
+
+    return FakeIdentityFeatures
diff --git a/tests/ui/test_central_widget_mode.py b/tests/ui/test_central_widget_mode.py
new file mode 100644
index 00000000..1dffd17c
--- /dev/null
+++ b/tests/ui/test_central_widget_mode.py
@@ -0,0 +1,256 @@
+"""Tests for the central_widget_mode per-mode dispatch helpers."""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
+from jabs.core.enums import ClassifierMode
+
+try:
+    from jabs.ui.main_window import central_widget_mode
+
+    SKIP_UI_TESTS = False
+    SKIP_REASON = None
+except ImportError as e:
+    SKIP_UI_TESTS = True
+    SKIP_REASON = f"Qt/UI dependencies not available: {e}"
+
+pytestmark = pytest.mark.skipif(
+    SKIP_UI_TESTS,
+    reason=SKIP_REASON if SKIP_UI_TESTS else "",
+)
+
+
+# ---------------------------------------------------------------------------
+# load_video_predictions
+# ---------------------------------------------------------------------------
+
+
+def test_load_video_predictions_binary_returns_none_class_names() -> None:
+    """Binary mode delegates to load_predictions and returns None for class_names."""
+    prediction_manager = SimpleNamespace(
+        load_predictions=MagicMock(
+            return_value=({0: np.zeros(3)}, {0: np.zeros(3)}, {0: np.zeros(3)})
+        ),
+        load_multiclass_predictions=MagicMock(),
+    )
+
+    preds, probs, postprocessed, class_names = central_widget_mode.load_video_predictions(
+        prediction_manager,
+        ClassifierMode.BINARY,
+        video_name="video.avi",
+        behavior="Walk",
+    )
+
+    prediction_manager.load_predictions.assert_called_once_with("video.avi", "Walk")
+    prediction_manager.load_multiclass_predictions.assert_not_called()
+    assert class_names is None
+    assert 0 in preds and 0 in probs and 0 in postprocessed
+
+
+def test_load_video_predictions_multiclass_returns_class_names() -> None:
+    """Multi-class mode delegates to load_multiclass_predictions and forwards class_names."""
+    prediction_manager = SimpleNamespace(
+        load_predictions=MagicMock(),
+        load_multiclass_predictions=MagicMock(
+            return_value=({0: np.zeros(3)}, {0: np.zeros((3, 3))}, {}, ["None", "Walk", "Run"])
+        ),
+    )
+
+    preds, probs, postprocessed, class_names = central_widget_mode.load_video_predictions(
+        prediction_manager,
+        ClassifierMode.MULTICLASS,
+        video_name="video.avi",
+        behavior="Walk",
+    )
+
+    prediction_manager.load_multiclass_predictions.assert_called_once_with("video.avi")
+    prediction_manager.load_predictions.assert_not_called()
+    assert class_names == ["None", "Walk", "Run"]
+    assert probs[0].ndim == 2
+
+
+# ---------------------------------------------------------------------------
+# apply_behavior_label
+# ---------------------------------------------------------------------------
+
+
+def _make_track_mock() -> MagicMock:
+    track = MagicMock()
+    track.label_behavior = MagicMock()
+    track.label_not_behavior = MagicMock()
+    track.clear_labels = MagicMock()
+    return track
+
+
+def test_apply_behavior_label_binary_does_not_clear_competing() -> None:
+    """Binary mode labels the current track and never iterates competing behaviors."""
+    current_track = _make_track_mock()
+    labels = SimpleNamespace(
+        get_track_labels=MagicMock(return_value=current_track),
+        iter_behavior_labels=MagicMock(),
+    )
+
+    central_widget_mode.apply_behavior_label(
+        labels,
+        ClassifierMode.BINARY,
+        identity_str="0",
+        current_behavior="Walk",
+        start=10,
+        end=20,
+    )
+
+    labels.iter_behavior_labels.assert_not_called()
+    labels.get_track_labels.assert_called_once_with("0", "Walk")
+    current_track.label_behavior.assert_called_once_with(10, 20)
+
+
+def test_apply_behavior_label_multiclass_clears_competing_then_labels() -> None:
+    """Multi-class mode clears non-current behavior tracks on the range, then labels current."""
+    competing_track = _make_track_mock()
+    current_track = _make_track_mock()
+    labels = SimpleNamespace(
+        get_track_labels=MagicMock(return_value=current_track),
+        iter_behavior_labels=MagicMock(
+            return_value=iter([("Walk", current_track), ("Run", competing_track)])
+        ),
+    )
+
+    central_widget_mode.apply_behavior_label(
+        labels,
+        ClassifierMode.MULTICLASS,
+        identity_str="0",
+        current_behavior="Walk",
+        start=10,
+        end=20,
+    )
+
+    competing_track.clear_labels.assert_called_once_with(10, 20)
+    current_track.clear_labels.assert_not_called()
+    current_track.label_behavior.assert_called_once_with(10, 20)
+
+
+# ---------------------------------------------------------------------------
+# apply_not_behavior_label
+# ---------------------------------------------------------------------------
+
+
+def test_apply_not_behavior_label_binary_returns_current_behavior_false() -> None:
+    """Binary mode labels the current track as not-behavior and reports (behavior, False)."""
+    current_track = _make_track_mock()
+    labels = SimpleNamespace(
+        get_track_labels=MagicMock(return_value=current_track),
+        iter_behavior_labels=MagicMock(),
+    )
+
+    behavior_key, is_positive = central_widget_mode.apply_not_behavior_label(
+        labels,
+        ClassifierMode.BINARY,
+        identity_str="0",
+        current_behavior="Walk",
+        start=5,
+        end=15,
+    )
+
+    assert behavior_key == "Walk"
+    assert is_positive is False
+    labels.iter_behavior_labels.assert_not_called()
+    current_track.label_not_behavior.assert_called_once_with(5, 15)
+
+
+def test_apply_not_behavior_label_multiclass_returns_none_key_true() -> None:
+    """Multi-class mode clears competing tracks and labels the NONE track as positive."""
+    none_track = _make_track_mock()
+    competing_track = _make_track_mock()
+    labels = SimpleNamespace(
+        get_track_labels=MagicMock(return_value=none_track),
+        iter_behavior_labels=MagicMock(
+            return_value=iter([(MULTICLASS_NONE_BEHAVIOR, none_track), ("Walk", competing_track)])
+        ),
+    )
+
+    behavior_key, is_positive = central_widget_mode.apply_not_behavior_label(
+        labels,
+        ClassifierMode.MULTICLASS,
+        identity_str="0",
+        current_behavior="Walk",
+        start=5,
+        end=15,
+    )
+
+    assert behavior_key == MULTICLASS_NONE_BEHAVIOR
+    assert is_positive is True
+    competing_track.clear_labels.assert_called_once_with(5, 15)
+    none_track.clear_labels.assert_not_called()
+    labels.get_track_labels.assert_called_once_with("0", MULTICLASS_NONE_BEHAVIOR)
+    none_track.label_behavior.assert_called_once_with(5, 15)
+
+
+# ---------------------------------------------------------------------------
+# build_timeline_label_arrays
+# ---------------------------------------------------------------------------
+
+
+def test_build_timeline_label_arrays_multiclass_uses_merged_arrays() -> None:
+    """Multi-class returns one merged label array per identity from VideoLabels."""
+    expected = [
+        np.array([0, 1, 2, 0], dtype=np.int16),
+        np.array([1, 0, 0, 2], dtype=np.int16),
+    ]
+    labels = SimpleNamespace(
+        build_multiclass_label_array=MagicMock(side_effect=expected),
+        get_track_labels=MagicMock(),
+    )
+
+    result = central_widget_mode.build_timeline_label_arrays(
+        labels,
+        ClassifierMode.MULTICLASS,
+        num_identities=2,
+        current_behavior="Walk",
+        behaviors=["Walk", "Run"],
+    )
+
+    labels.get_track_labels.assert_not_called()
+    assert labels.build_multiclass_label_array.call_count == 2
+    labels.build_multiclass_label_array.assert_any_call("0", ["Walk", "Run"])
+    labels.build_multiclass_label_array.assert_any_call("1", ["Walk", "Run"])
+    np.testing.assert_array_equal(result[0], expected[0])
+    np.testing.assert_array_equal(result[1], expected[1])
+
+
+def test_build_timeline_label_arrays_binary_uses_lut_indices(monkeypatch) -> None:
+    """Binary returns one LUT-index array per identity for current_behavior only."""
+    lut_call_args: list = []
+
+    def fake_lut(track):
+        lut_call_args.append(track)
+        return np.array([1, 2, 3], dtype=np.int16)
+
+    monkeypatch.setattr(
+        "jabs.ui.main_window.central_widget_mode.track_labels_to_lut_indices",
+        fake_lut,
+    )
+
+    track_a = MagicMock(name="track_a")
+    track_b = MagicMock(name="track_b")
+    labels = SimpleNamespace(
+        get_track_labels=MagicMock(side_effect=[track_a, track_b]),
+        build_multiclass_label_array=MagicMock(),
+    )
+
+    result = central_widget_mode.build_timeline_label_arrays(
+        labels,
+        ClassifierMode.BINARY,
+        num_identities=2,
+        current_behavior="Walk",
+        behaviors=["Walk", "Run"],
+    )
+
+    labels.build_multiclass_label_array.assert_not_called()
+    labels.get_track_labels.assert_any_call("0", "Walk")
+    labels.get_track_labels.assert_any_call("1", "Walk")
+    assert lut_call_args == [track_a, track_b]
+    assert len(result) == 2
diff --git a/tests/ui/test_central_widget_multiclass.py b/tests/ui/test_central_widget_multiclass.py
index d6229ec4..9611adfe 100644
--- a/tests/ui/test_central_widget_multiclass.py
+++ b/tests/ui/test_central_widget_multiclass.py
@@ -6,7 +6,6 @@
 import pytest
 
 try:
-    from jabs.core.enums import CrossValidationGroupingStrategy
     from jabs.ui.main_window.central_widget import CentralWidget
 
     SKIP_UI_TESTS = False
@@ -115,66 +114,3 @@ def test_get_multiclass_prediction_rows_falls_back_on_frame_mismatch() -> None:
         np.testing.assert_array_equal(row, np.zeros(4, dtype=np.int16))
     for row in probability_rows[0]:
         np.testing.assert_array_equal(row, np.zeros(4, dtype=np.float32))
-
-
-def test_count_multiclass_valid_logo_splits_individual() -> None:
-    """Valid-split counting matches multiclass LOGO constraints for per-identity groups."""
-    counts_by_behavior = {
-        "None": {
-            "video_a.avi": {
-                0: {"fragmented_frame_counts": (20, 0)},
-                1: {"fragmented_frame_counts": (20, 0)},
-                2: {"fragmented_frame_counts": (20, 0)},
-            }
-        },
-        "Walk": {
-            "video_a.avi": {
-                0: {"fragmented_frame_counts": (20, 0)},
-                1: {"fragmented_frame_counts": (20, 0)},
-                2: {"fragmented_frame_counts": (20, 0)},
-            }
-        },
-        "Run": {
-            "video_a.avi": {
-                0: {"fragmented_frame_counts": (0, 0)},
-                1: {"fragmented_frame_counts": (20, 0)},
-                2: {"fragmented_frame_counts": (20, 0)},
-            }
-        },
-    }
-
-    valid = CentralWidget._count_multiclass_valid_logo_splits(
-        counts_by_behavior=counts_by_behavior,
-        behavior_names=["None", "Walk", "Run"],
-        grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
-        threshold=20,
-    )
-
-    assert valid == 3
-
-
-def test_count_multiclass_valid_logo_splits_video_grouping() -> None:
-    """Video grouping aggregates identities per video before validity checks."""
-    counts_by_behavior = {
-        "None": {
-            "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
-            "video_b.avi": {0: {"fragmented_frame_counts": (20, 0)}},
-        },
-        "Walk": {
-            "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
-            "video_b.avi": {0: {"fragmented_frame_counts": (20, 0)}},
-        },
-        "Run": {
-            "video_a.avi": {0: {"fragmented_frame_counts": (20, 0)}},
-            "video_b.avi": {0: {"fragmented_frame_counts": (0, 0)}},
-        },
-    }
-
-    valid = CentralWidget._count_multiclass_valid_logo_splits(
-        counts_by_behavior=counts_by_behavior,
-        behavior_names=["None", "Walk", "Run"],
-        grouping_strategy=CrossValidationGroupingStrategy.VIDEO,
-        threshold=20,
-    )
-
-    assert valid == 1
diff --git a/tests/ui/test_classification_thread.py b/tests/ui/test_classification_thread.py
index 13b20f30..18b694b5 100644
--- a/tests/ui/test_classification_thread.py
+++ b/tests/ui/test_classification_thread.py
@@ -1,16 +1,17 @@
 """Tests for ClassifyThread binary and multiclass branches."""
 
-from types import SimpleNamespace
-from typing import ClassVar
-from unittest.mock import MagicMock
-
-import numpy as np
 import pytest
 
+from jabs.core.enums import ClassifierMode
+from jabs.project.prediction_manager import MULTICLASS_PREDICTION_KEY
+
+from ._fakes import (
+    FakeClassifyingClassifier,
+    FakeClassifyingProject,
+    make_fake_identity_features,
+)
+
 try:
-    from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-    from jabs.core.enums import ClassifierMode, ProjectDistanceUnit
-    from jabs.project.prediction_manager import MULTICLASS_PREDICTION_KEY
     from jabs.ui.classification_thread import ClassifyThread
 
     SKIP_UI_TESTS = False
@@ -25,98 +26,9 @@
 )
 
 
-class _FakeClassifier:
-    """Simple classifier test double for ClassifyThread."""
-
-    def __init__(self, multiclass: bool = False) -> None:
-        self._multiclass = multiclass
-        self.project_settings = {"window_size": 7}
-        self.behavior_names = ["Walk", "Run"]
-
-    @staticmethod
-    def combine_data(per_frame, window):
-        import pandas as pd
-
-        return pd.concat([per_frame, window], axis=1)
-
-    def predict_proba(self, data, frame_indexes):
-        n = len(data)
-        if self._multiclass:
-            probs = np.zeros((n, 3), dtype=np.float32)
-            probs[:, 0] = 0.1
-            probs[:, 1] = 0.7
-            probs[:, 2] = 0.2
-            probs[frame_indexes == -1] = 0.0
-            return probs
-
-        probs = np.zeros((n, 2), dtype=np.float32)
-        probs[:, 0] = 0.2
-        probs[:, 1] = 0.8
-        return probs
-
-    def derive_predictions(self, probabilities):
-        predictions = np.argmax(probabilities, axis=1).astype(np.int8)
-        confidence = probabilities[np.arange(len(probabilities)), predictions].astype(np.float32)
-        predictions[confidence == 0] = -1
-        return predictions, confidence
-
-    @staticmethod
-    def get_class_names() -> list[str]:
-        return [MULTICLASS_NONE_BEHAVIOR, "Walk", "Run"]
-
-
-class _FakePose:
-    """Pose-estimation test double."""
-
-    def __init__(self) -> None:
-        self.identities = [0]
-        self.num_identities = 1
-        self.num_frames = 5
-        self.fps = 30
-
-
-class _FakeProject:
-    """Project test double for classification thread tests."""
-
-    def __init__(self, mode: ClassifierMode) -> None:
-        self.settings_manager = SimpleNamespace(
-            classifier_mode=mode,
-            get_behavior=lambda _behavior: {"window_size": 5, "postprocessing": []},
-        )
-        self.feature_manager = SimpleNamespace(distance_unit=ProjectDistanceUnit.PIXEL)
-        self.video_manager = SimpleNamespace(
-            videos=["video.avi"],
-            video_path=lambda _video: "video.avi",
-            num_videos=1,
-        )
-        self.feature_dir = "feature_dir"
-        self.cache_format = "hdf5"
-        self.save_predictions = MagicMock()
-        self._pose = _FakePose()
-
-    def load_pose_est(self, _video_path):
-        return self._pose
-
-    @staticmethod
-    def get_project_defaults() -> dict:
-        return {"window_size": 9}
-
-
 def test_classify_thread_binary_path(monkeypatch) -> None:
     """Binary mode applies postprocessing and writes behavior-scoped predictions."""
 
-    class _FakeIdentityFeatures:
-        def __init__(self, *_args, **_kwargs):
-            pass
-
-        @staticmethod
-        def get_features(_window_size):
-            return {
-                "per_frame": {"a": np.arange(5, dtype=np.float32)},
-                "window": {"b": np.arange(5, dtype=np.float32)},
-                "frame_indexes": np.arange(5, dtype=np.intp),
-            }
-
     class _FakePostprocessingPipeline:
         def __init__(self, _config):
             pass
@@ -125,14 +37,17 @@ def __init__(self, _config):
         def run(predictions, _probabilities):
             return predictions.copy()
 
-    monkeypatch.setattr("jabs.ui.classification_thread.IdentityFeatures", _FakeIdentityFeatures)
     monkeypatch.setattr(
-        "jabs.ui.classification_thread.PostprocessingPipeline",
+        "jabs.ui.classification_thread.IdentityFeatures",
+        make_fake_identity_features(),
+    )
+    monkeypatch.setattr(
+        "jabs.ui.classify_strategy.PostprocessingPipeline",
         _FakePostprocessingPipeline,
     )
 
-    project = _FakeProject(ClassifierMode.BINARY)
-    classifier = _FakeClassifier(multiclass=False)
+    project = FakeClassifyingProject(ClassifierMode.BINARY)
+    classifier = FakeClassifyingClassifier(multiclass=False)
     thread = ClassifyThread(classifier, project, "Walk", "video.avi")
     completions: list[dict] = []
     errors: list[Exception] = []
@@ -159,34 +74,21 @@ def run(predictions, _probabilities):
 def test_classify_thread_multiclass_path(monkeypatch) -> None:
     """Multiclass mode skips postprocessing and writes reserved-key predictions with class names."""
 
-    class _FakeIdentityFeatures:
-        op_settings_seen: ClassVar[list[dict]] = []
-
-        def __init__(self, *_args, **kwargs):
-            self.__class__.op_settings_seen.append(kwargs["op_settings"])
-
-        @staticmethod
-        def get_features(_window_size):
-            return {
-                "per_frame": {"a": np.arange(5, dtype=np.float32)},
-                "window": {"b": np.arange(5, dtype=np.float32)},
-                "frame_indexes": np.arange(5, dtype=np.intp),
-            }
-
     class _PostprocessingMustNotRun:
         def __init__(self, _config):
             raise AssertionError(
                 "PostprocessingPipeline should not be instantiated in multiclass mode"
             )
 
-    monkeypatch.setattr("jabs.ui.classification_thread.IdentityFeatures", _FakeIdentityFeatures)
+    fake_features_cls = make_fake_identity_features()
+    monkeypatch.setattr("jabs.ui.classification_thread.IdentityFeatures", fake_features_cls)
     monkeypatch.setattr(
-        "jabs.ui.classification_thread.PostprocessingPipeline",
+        "jabs.ui.classify_strategy.PostprocessingPipeline",
         _PostprocessingMustNotRun,
     )
 
-    project = _FakeProject(ClassifierMode.MULTICLASS)
-    classifier = _FakeClassifier(multiclass=True)
+    project = FakeClassifyingProject(ClassifierMode.MULTICLASS)
+    classifier = FakeClassifyingClassifier(multiclass=True)
     thread = ClassifyThread(classifier, project, "Walk", "video.avi")
     completions: list[dict] = []
     errors: list[Exception] = []
@@ -208,4 +110,4 @@ def __init__(self, _config):
     assert args[4] == MULTICLASS_PREDICTION_KEY
     assert kwargs["class_names"] == ["None", "Walk", "Run"]
     assert kwargs["postprocessed_predictions"] == {}
-    assert _FakeIdentityFeatures.op_settings_seen[0]["window_size"] == 7
+    assert fake_features_cls.op_settings_seen[0]["window_size"] == 7
diff --git a/tests/ui/test_training_thread.py b/tests/ui/test_training_thread.py
index b7038551..3280edba 100644
--- a/tests/ui/test_training_thread.py
+++ b/tests/ui/test_training_thread.py
@@ -1,19 +1,14 @@
 """Tests for TrainingThread binary and multiclass training branches."""
 
-from types import SimpleNamespace
-from unittest.mock import MagicMock
-
 import numpy as np
 import pandas as pd
 import pytest
 
+from jabs.core.enums import ClassifierMode
+
+from ._fakes import FakeTrainingClassifier, FakeTrainingProject
+
 try:
-    from jabs.core.constants import MULTICLASS_NONE_BEHAVIOR
-    from jabs.core.enums import (
-        ClassifierMode,
-        CrossValidationGroupingStrategy,
-        ProjectDistanceUnit,
-    )
     from jabs.ui.training_thread import TrainingThread
 
     SKIP_UI_TESTS = False
@@ -28,104 +23,6 @@
 )
 
 
-class _FakeClassifier:
-    """Small classifier test double for TrainingThread."""
-
-    def __init__(self, name: str = "random_forest", project_settings: dict | None = None) -> None:
-        self.classifier_name = name
-        self.project_settings = {} if project_settings is None else dict(project_settings)
-        self.behavior_names = ["Walk", "Run"]
-        self.train_calls: list[dict] = []
-
-    @staticmethod
-    def combine_data(per_frame: pd.DataFrame, window: pd.DataFrame) -> pd.DataFrame:
-        return pd.concat([per_frame, window], axis=1)
-
-    def train(self, data: dict, random_seed: int | None = None) -> None:
-        call = dict(data)
-        call["random_seed"] = random_seed
-        self.train_calls.append(call)
-
-    def get_class_names(self) -> list[str]:
-        return [MULTICLASS_NONE_BEHAVIOR, *self.behavior_names]
-
-    @staticmethod
-    def get_feature_importance(limit: int = 20) -> list[tuple[str, float]]:
-        return [("feat_a", 1.0)][:limit]
-
-
-class _FakeProject:
-    """Project test double providing only the APIs TrainingThread uses."""
-
-    def __init__(
-        self,
-        tmp_path,
-        mode: ClassifierMode,
-        binary_features: dict | None = None,
-        multiclass_features: dict | None = None,
-    ) -> None:
-        self.project_paths = SimpleNamespace(training_log_dir=tmp_path)
-        self.feature_manager = SimpleNamespace(distance_unit=ProjectDistanceUnit.PIXEL)
-        self.session_tracker = SimpleNamespace(classifier_trained=MagicMock())
-        self.settings_manager = SimpleNamespace(
-            classifier_mode=mode,
-            cv_grouping_strategy=CrossValidationGroupingStrategy.INDIVIDUAL,
-            get_behavior=lambda _behavior: {
-                "window_size": 5,
-                "balance_labels": False,
-                "symmetric_behavior": False,
-            },
-        )
-        self._binary_features = binary_features
-        self._multiclass_features = multiclass_features
-        self.binary_calls = 0
-        self.multiclass_calls = 0
-        self.save_classifier = MagicMock()
-
-    def get_project_defaults(self) -> dict:
-        return {
-            "window_size": 5,
-            "balance_labels": False,
-            "symmetric_behavior": False,
-        }
-
-    def get_labeled_features(
-        self,
-        behavior: str,
-        progress_callable=None,
-        should_terminate_callable=None,
-    ) -> tuple[dict, dict]:
-        self.binary_calls += 1
-        if should_terminate_callable is not None:
-            should_terminate_callable()
-        if progress_callable is not None:
-            progress_callable()
-        return self._binary_features, {0: {"video": "video.avi", "identity": 0}}
-
-    def get_multiclass_labeled_features(
-        self,
-        progress_callable=None,
-        should_terminate_callable=None,
-        behavior_settings=None,
-    ) -> tuple[dict, dict]:
-        self.multiclass_calls += 1
-        if should_terminate_callable is not None:
-            should_terminate_callable()
-        if progress_callable is not None:
-            progress_callable()
-        return self._multiclass_features, {}
-
-    @staticmethod
-    def counts(_behavior: str) -> dict:
-        return {
-            "video.avi": {
-                0: {
-                    "unfragmented_bout_counts": (1, 0),
-                }
-            }
-        }
-
-
 def test_training_thread_binary_path(monkeypatch, tmp_path) -> None:
     """Binary mode uses get_labeled_features + CV/report path and saves behavior-scoped classifier."""
     features = {
@@ -134,8 +31,8 @@ def test_training_thread_binary_path(monkeypatch, tmp_path) -> None:
         "labels": np.array([1, 0], dtype=np.int8),
         "groups": np.array([0, 1], dtype=np.int32),
     }
-    project = _FakeProject(tmp_path, ClassifierMode.BINARY, binary_features=features)
-    classifier = _FakeClassifier()
+    project = FakeTrainingProject(tmp_path, ClassifierMode.BINARY, binary_features=features)
+    classifier = FakeTrainingClassifier()
 
     cv_called = {"count": 0}
 
@@ -186,8 +83,10 @@ def test_training_thread_multiclass_path(monkeypatch, tmp_path) -> None:
         },
         "groups": np.array([0, 0, 1], dtype=np.int32),
     }
-    project = _FakeProject(tmp_path, ClassifierMode.MULTICLASS, multiclass_features=features)
-    classifier = _FakeClassifier(name="catboost", project_settings={"window_size": 7})
+    project = FakeTrainingProject(
+        tmp_path, ClassifierMode.MULTICLASS, multiclass_features=features
+    )
+    classifier = FakeTrainingClassifier(name="catboost", project_settings={"window_size": 7})
 
     cv_called = {"count": 0}