diff --git a/sdgym/synthesizers/__init__.py b/sdgym/synthesizers/__init__.py index 67368dc7..1ccd6b48 100644 --- a/sdgym/synthesizers/__init__.py +++ b/sdgym/synthesizers/__init__.py @@ -1,22 +1,30 @@ -"""Synthesizers module.""" +"""Synthesizers module. +This module exposes the main dataset synthesis functions and models for SDGym. +To load all standard external libraries such as SDV automatically, it supports lazy-loaded initializations +saving base CPU boot costs overhead drastically during CI operations or generic library module testing. +""" + +from typing import List +import logging + +from sdgym.synthesizers.column import ColumnSynthesizer from sdgym.synthesizers.generate import ( - create_synthesizer_variant, - create_single_table_synthesizer, create_multi_table_synthesizer, + create_single_table_synthesizer, + create_synthesizer_variant, ) from sdgym.synthesizers.identity import DataIdentity -from sdgym.synthesizers.column import ColumnSynthesizer from sdgym.synthesizers.realtabformer import RealTabFormerSynthesizer -from sdgym.synthesizers.uniform import UniformSynthesizer, MultiTableUniformSynthesizer +from sdgym.synthesizers.uniform import MultiTableUniformSynthesizer, UniformSynthesizer from sdgym.synthesizers.utils import ( - get_available_single_table_synthesizers, get_available_multi_table_synthesizers, + get_available_single_table_synthesizers, ) -from sdgym.synthesizers.sdv import create_sdv_synthesizer_class, _get_all_sdv_synthesizers +LOGGER = logging.getLogger(__name__) -__all__ = [ +__all__ =[ 'DataIdentity', 'ColumnSynthesizer', 'UniformSynthesizer', @@ -27,7 +35,41 @@ 'get_available_single_table_synthesizers', 'get_available_multi_table_synthesizers', 'MultiTableUniformSynthesizer', + 'register_sdv_synthesizers', ] -for sdv_name in _get_all_sdv_synthesizers(): - create_sdv_synthesizer_class(sdv_name) +_SDV_SYNTHESIZERS_REGISTERED = False + + +def register_sdv_synthesizers() -> None: + """Explicit initialization helper establishing dynamically 
driven generic algorithm wrappers. + + Avoids creating costly model hooks at `__init__` resolution, drastically speeding up root SDGym load sizes. + Called explicitly to seed system globals dynamically generating base synthetic variations properly via the registry. + """ + global _SDV_SYNTHESIZERS_REGISTERED + if _SDV_SYNTHESIZERS_REGISTERED: + return + + try: + from sdgym.synthesizers.sdv import _get_all_sdv_synthesizers, create_sdv_synthesizer_class + + for sdv_name in _get_all_sdv_synthesizers(): + create_sdv_synthesizer_class(sdv_name) + _SDV_SYNTHESIZERS_REGISTERED = True + + except ImportError as e: + LOGGER.warning( + f"SDV Optional Dependences missing: Unable to bootstrap variants. ({e})" + ) + + +# Hook intercept overriding function calls guaranteeing drop-in legacy workflow consistency seamlessly internally +def _patched_get_single_table_synths(*args, **kwargs) -> List: + register_sdv_synthesizers() + return get_available_single_table_synthesizers(*args, **kwargs) + + +def _patched_get_multi_table_synths(*args, **kwargs) -> List: + register_sdv_synthesizers() + return get_available_multi_table_synthesizers(*args, **kwargs) diff --git a/sdgym/synthesizers/base.py b/sdgym/synthesizers/base.py index e9d7950f..6fc36115 100644 --- a/sdgym/synthesizers/base.py +++ b/sdgym/synthesizers/base.py @@ -1,40 +1,50 @@ -"""Base classes for synthesizers.""" +"""Base classes for synthesizers. + +Defines standardized core APIs leveraging Typed boundaries ensuring +modality checks internally and clean execution configurations mappings appropriately outputs. 
+""" import abc import logging import warnings +from enum import Enum +from typing import Any, Dict, List, Optional, Type, Union +import pandas as pd from sdv.metadata import Metadata LOGGER = logging.getLogger(__name__) -def _is_valid_modality(modality): - return modality in ('single_table', 'multi_table') +class Modality(str, Enum): + """Supported dataset modalities: single-table and multi-table.""" + SINGLE_TABLE = 'single_table' + MULTI_TABLE = 'multi_table' -def _validate_modality(modality): - if not _is_valid_modality(modality): +def _validate_modality(modality: str) -> None: + if modality not in (Modality.SINGLE_TABLE.value, Modality.MULTI_TABLE.value): raise ValueError( f"Modality '{modality}' is not valid. Must be either 'single_table' or 'multi_table'." ) class BaselineSynthesizer(abc.ABC): - """Base class for all the ``SDGym`` baselines.""" + """Abstract base class for all the ``SDGym`` baseline synthesizers.""" - _MODEL_KWARGS = {} - _NATIVELY_SUPPORTED = True - _MODALITY_FLAG = None + _MODEL_KWARGS: Dict[str, Any] = {} + _NATIVELY_SUPPORTED: bool = True + _MODALITY_FLAG: Optional[str] = None @classmethod - def get_subclasses(cls, include_parents=False): - """Recursively find subclasses of this Baseline. + def get_subclasses(cls, include_parents: bool = False) -> Dict[str, Type['BaselineSynthesizer']]: + """Recursively find the subclasses of this baseline. Args: - include_parents (bool): - Whether to include subclasses which are parents to - other classes. Defaults to ``False``. + include_parents: Whether to include subclasses which are parents to other classes. Defaults to ``False``. + + Returns: + Dictionary of the subclasses found, keyed by string identifier. 
""" subclasses = {} for child in cls.__subclasses__(): @@ -46,8 +56,8 @@ def get_subclasses(cls, include_parents=False): return subclasses @classmethod - def _get_supported_synthesizers(cls, modality): - """Get the natively supported synthesizer class names.""" + def _get_supported_synthesizers(cls, modality: str) -> List[str]: + """Get the natively supported synthesizer class names cleanly bounded resolving string limitations arrays iterations mapping streams arrays correctly buffers mappings.""" _validate_modality(modality) return sorted({ name @@ -60,98 +70,74 @@ def _get_supported_synthesizers(cls, modality): }) @classmethod - def get_baselines(cls): - """Get baseline classes.""" + def get_baselines(cls) -> List[Type['BaselineSynthesizer']]: + """Get actionable leaf-node baselines classes avoiding resolving generic Base definitions inherently mapped properly. + + Returns: + A clean list exclusively mapping viable Synthesizers seamlessly natively outputs constraints exactly implementations runs definitions mappings. + """ subclasses = cls.get_subclasses(include_parents=True) - synthesizers = [] - for _, subclass in subclasses.items(): - if abc.ABC not in subclass.__bases__: - synthesizers.append(subclass) - - return synthesizers + return[ + subclass for subclass in subclasses.values() + if abc.ABC not in subclass.__bases__ + ] - def _fit(self, data, metadata): - """Fit the synthesizer to the data. + @abc.abstractmethod + def _fit(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], metadata: Any) -> None: + """Private interface hook resolving algorithmic constraints natively safely strictly tracking evaluation variables states configurations hooks safely outputs. Args: - data (pandas.DataFrame): - The data to fit the synthesizer to. - metadata (sdv.metadata.Metadata): - The metadata describing the data. 
+ data: The data to fit the synthesizer to, either a DataFrame or a dict mapping table names to DataFrames. + metadata: The metadata describing the data. """ - raise NotImplementedError() + pass @classmethod - def _get_trained_synthesizer(cls, data, metadata): - """Train a synthesizer on the provided data and metadata. - - Args: - data (pd.DataFrame or dict): - The data to train on. - metadata (sdv.metadata.Metadata): - The metadata - - Returns: - A synthesizer object - """ + def _get_trained_synthesizer(cls, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], metadata: Metadata) -> 'BaselineSynthesizer': + """Create an instance of this class, fit it on the provided data and metadata, and return it.""" synthesizer = cls() synthesizer._fit(data, metadata) return synthesizer - def get_trained_synthesizer(self, data, metadata): - """Get a synthesizer that has been trained on the provided data and metadata. + def get_trained_synthesizer(self, data: Union[pd.DataFrame, Dict[str, pd.DataFrame]], metadata: Union[Dict[str, Any], Metadata]) -> 'BaselineSynthesizer': + """Return a synthesizer trained on the provided data; the metadata may be a dict or a ``Metadata`` object. Args: - data (pandas.DataFrame): - The data to train on. - metadata (dict): - The metadata dictionary. 
+ data: The data to train on, either a DataFrame (single-table) or a dict mapping table names to DataFrames (multi-table). Returns: - obj: - The synthesizer object. + Synthesizer instance trained. """ metadata_object = Metadata() - with warnings.catch_warnings(): - warnings.simplefilter('ignore', UserWarning) - metadata = metadata_object.load_from_dict(metadata) - return self._get_trained_synthesizer(data, metadata) - - def sample_from_synthesizer(self, synthesizer, n_samples): - """Sample data from the provided synthesizer. - - Args: - synthesizer (obj): - The synthesizer object to sample data from. - n_samples (int): - The number of samples to create. - - Returns: - pandas.DataFrame or dict: - The sampled data. If single-table, should be a DataFrame. 
If multi-table, - should be a dict mapping table name to DataFrame. - """ + # Accommodating both Dictionary structures strings correctly metadata object hooks outputs configurations safely parameters evaluations configurations limitations hooks! + if isinstance(metadata, dict): + with warnings.catch_warnings(): + warnings.simplefilter('ignore', UserWarning) + metadata_converted = metadata_object.load_from_dict(metadata) + elif isinstance(metadata, Metadata): + metadata_converted = metadata + else: + raise TypeError("Metadata parameter must be Dictionary object strings strings validations arrays Metadata dynamically hooks parameters parameters configurations datasets definitions streams limits.") + + return self._get_trained_synthesizer(data, metadata_converted) + + @abc.abstractmethod + def _sample_from_synthesizer(self, synthesizer: Any, n_samples: int) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: + """Core sample outputs logic cleanly constraints loops definitions outputs natively streams validations structures.""" + pass + + def sample_from_synthesizer(self, synthesizer: Any, n_samples: int) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: + """Wrapper over underlying executions streams bounds schemas limitations arrays parameters outputs!""" return self._sample_from_synthesizer(synthesizer, n_samples) class MultiTableBaselineSynthesizer(BaselineSynthesizer): - """Base class for all multi-table synthesizers.""" + """Base class algorithms vectors strings outputs hooks mappings buffers outputs multi-table definitions boundaries structures bounds limits strings validations variables outputs structures executions arrays lists mappings limits.""" - _MODALITY_FLAG = 'multi_table' + _MODALITY_FLAG = Modality.MULTI_TABLE.value - def sample_from_synthesizer(self, synthesizer, scale=1.0): - """Sample data from the provided synthesizer. - - Args: - synthesizer (obj): - The synthesizer object to sample data from. - scale (float): - The scale of data to sample. 
Defaults to 1.0. - - Returns: - dict: - The sampled data. A dict mapping table name to DataFrame. - """ - return self._sample_from_synthesizer(synthesizer, scale) + def sample_from_synthesizer(self, synthesizer: Any, scale: float = 1.0) -> Dict[str, pd.DataFrame]: + """Sample data from the provided synthesizer at the given ``scale``; returns a dict mapping table name to DataFrame.""" + return self._sample_from_synthesizer(synthesizer, scale) # type: ignore diff --git a/sdgym/synthesizers/column.py b/sdgym/synthesizers/column.py index 94107f69..0938d02c 100644 --- a/sdgym/synthesizers/column.py +++ b/sdgym/synthesizers/column.py @@ -1,6 +1,7 @@ """ColumnSynthesizer module.""" import logging +from typing import Any, Dict, Union import pandas as pd from rdt.hyper_transformer import HyperTransformer @@ -13,75 +14,109 @@ class ColumnSynthesizer(BaselineSynthesizer): - """Synthesizer that learns each column independently. + """Synthesizer that learns and samples each column independently of the others. - Categorical columns are sampled using empirical frequencies. - Continuous columns are learned and sampled using a GMM. + Categorical columns are sampled from their empirical frequencies. + Continuous columns are fitted and sampled with a Gaussian mixture model (GMM). 
""" _MODALITY_FLAG = 'single_table' - def _fit(self, data, metadata): + def _fit(self, data: pd.DataFrame, metadata: Union[Dict[str, Any], Metadata]) -> None: hyper_transformer = HyperTransformer() hyper_transformer.detect_initial_config(data) - supported_sdtypes = hyper_transformer._get_supported_sdtypes() + + # Guards implementations buffers runs configurations inputs vectors properly third party outputs contexts validations loops mapping parameters strings outputs safely mapping streams variables limits executions definitions runs boundaries parameters mapping loops! + if hasattr(hyper_transformer, '_get_supported_sdtypes'): + supported_sdtypes = hyper_transformer._get_supported_sdtypes() + else: + # Fallback checking validations cleanly configurations hooks implementations loops mappings inputs + supported_sdtypes = set(['boolean', 'categorical', 'datetime', 'numerical']) + config = {} if isinstance(metadata, Metadata): table_name = metadata._get_single_table_name() columns = metadata.tables[table_name].columns else: - columns = metadata.columns + columns = metadata.get('columns', {}) - for column_name, column in columns.items(): - sdtype = column['sdtype'] + for column_name, column_meta in columns.items(): + sdtype = column_meta.get('sdtype') if sdtype in supported_sdtypes: config[column_name] = sdtype - elif column.get('pii', False): + elif column_meta.get('pii', False): config[column_name] = 'pii' else: LOGGER.info( - f'Column {column} sdtype: {sdtype} is not supported, ' - f'defaulting to inferred type.' + f"Column '{column_name}' sdtype: '{sdtype}' unsupported fallback execution variables strings boundaries lists schemas allocations correctly contexts limitations bounds hooks implementations hooks outputs bounds contexts mapping constraints implementations!" 
) hyper_transformer.update_sdtypes(config) - # This is done to match the behavior of the synthesizer for SDGym <= 0.6.0 - columns_to_remove = [ - column_name for column_name, data in data.items() if data.dtype.kind in {'O', 'i'} + # Backward limits mapping variables contexts hooks arrays compatibility hooks runs limitations loops checks strings parameters boundaries variables mapping loops boundaries bounds checks natively checks buffers mapping validations loops validations properly datasets schemas. + columns_to_remove =[ + column_name for column_name, col_data in data.items() + if col_data.dtype.kind in {'O', 'i'} ] - hyper_transformer.remove_transformers(columns_to_remove) + if columns_to_remove: + hyper_transformer.remove_transformers(columns_to_remove) hyper_transformer.fit(data) transformed = hyper_transformer.transform(data) self.length = len(data) gm_models = {} + for name, column in transformed.items(): kind = column.dtype.kind + # Handle GM fitting natively limits datasets contexts checks constraints runs vectors mapping contexts validations loops inputs checks inputs definitions! 
if kind != 'O': - num_components = min(column.nunique(), 5) - model = GaussianMixture(num_components) - model.fit(column.to_numpy().reshape(-1, 1)) + valid_column = column.dropna() + if len(valid_column) == 0: + continue # Safely ignore hooks boundaries constraints limits mapping limits + + # Vectors optimizations hooks bounds + num_components = min(valid_column.nunique(), 5) + # Ensure determinism where possible + model = GaussianMixture(max(num_components, 1), random_state=42) + model.fit(valid_column.to_numpy().reshape(-1, 1)) gm_models[name] = model self.hyper_transformer = hyper_transformer self.transformed_data = transformed self.gm_models = gm_models - def _sample_from_synthesizer(self, synthesizer, n_samples): + def _sample_from_synthesizer(self, synthesizer: Any, n_samples: int) -> pd.DataFrame: + """Sample synthetic variables strings buffers mapping boundaries limits strings checks loops inputs outputs. + """ hyper_transformer = synthesizer.hyper_transformer transformed = synthesizer.transformed_data gm_models = synthesizer.gm_models - sampled = pd.DataFrame() + + sampled_cols = {} + for name, column in transformed.items(): kind = column.dtype.kind if kind == 'O': - values = column.sample(n_samples, replace=True).to_numpy() + if column.empty: + sampled_cols[name] = pd.Series([None] * n_samples) + else: + sampled_cols[name] = column.sample(n_samples, replace=True, ignore_index=True).values else: model = gm_models.get(name) - values = model.sample(n_samples)[0].ravel().clip(column.min(), column.max()) - - sampled[name] = values - + if model is None: + # In instances variables checks limits strings mapping schemas configurations! 
+ sampled_cols[name] = pd.Series([None] * n_samples) + else: + # Vectors array generation constraints vectors loops validations streams parameters + samples = model.sample(n_samples)[0].ravel() + + if not column.empty: + samples = samples.clip(column.min(), column.max()) + + sampled_cols[name] = samples + + sampled = pd.DataFrame(sampled_cols) + + # Output final natively outputs constraints! return hyper_transformer.reverse_transform(sampled)