-
Notifications
You must be signed in to change notification settings - Fork 8
Add native Polars DataFrame support #99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4868db9
82492e5
0030291
cef9107
1bd2198
6761de0
441cddb
a0a86ce
bf8e984
00d81cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -55,6 +55,11 @@ try: | |||||||||||
| except ImportError: | ||||||||||||
| PIL = None | ||||||||||||
|
|
||||||||||||
| try: | ||||||||||||
| import polars as pl | ||||||||||||
| except ImportError: | ||||||||||||
| pl = None | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| # Various utility helper functions for doing things that are problematic in PYX files | ||||||||||||
| include "sbdf_helpers.pxi" | ||||||||||||
|
|
@@ -420,6 +425,18 @@ cdef class _ImportContext: | |||||||||||
| """ | ||||||||||||
| return _valuetype_id_to_spotfire_typename(self.value_type.id) | ||||||||||||
|
|
||||||||||||
    cpdef bint is_object_numpy_type(self):
        """Return True if the numpy type for this column is NPY_OBJECT.

        :return: True if the numpy type is object, False otherwise

        .. note:: ``numpy_type_num`` is a ``cdef`` attribute and is therefore inaccessible
            from Python-level code. This ``cpdef`` wrapper exposes the comparison so that
            :func:`_import_build_polars_dataframe` can branch on it without touching the
            Cython-only attribute directly.
        """
        # NPY_OBJECT arrays hold boxed Python objects rather than raw C values, so
        # downstream consumers must convert them differently (e.g. via tolist()).
        return self.numpy_type_num == np_c.NPY_OBJECT
|
|
||||||||||||
|
|
||||||||||||
| # Individual functions for importing each value type. | ||||||||||||
| ctypedef int(*importer_fn)(_ImportContext, sbdf_c.sbdf_columnslice*) | ||||||||||||
|
|
@@ -654,13 +671,87 @@ cdef dict _import_metadata(sbdf_c.sbdf_metadata_head* md, int column_num): | |||||||||||
| return metadata | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
| def import_data(sbdf_file): | ||||||||||||
| """Import data from an SBDF file and create a 'pandas' DataFrame. | ||||||||||||
cdef object _import_polars_dtype(_ImportContext context):
    """Return the Polars dtype corresponding to the SBDF value type in the import context.

    :param context: import context for a column
    :return: the Polars dtype object
    :raise SBDFError: if the SBDF value type has no Polars equivalent
    """
    # Table-driven lookup from SBDF value type id to Polars dtype; assumes the caller
    # has already verified that polars is importable (``pl`` is not None).
    dtype_by_id = {
        sbdf_c.SBDF_BOOLTYPEID: pl.Boolean,
        sbdf_c.SBDF_INTTYPEID: pl.Int32,
        sbdf_c.SBDF_LONGTYPEID: pl.Int64,
        sbdf_c.SBDF_FLOATTYPEID: pl.Float32,
        sbdf_c.SBDF_DOUBLETYPEID: pl.Float64,
        sbdf_c.SBDF_STRINGTYPEID: pl.Utf8,
        sbdf_c.SBDF_DATETIMETYPEID: pl.Datetime,
        sbdf_c.SBDF_DATETYPEID: pl.Date,
        sbdf_c.SBDF_TIMETYPEID: pl.Time,
        sbdf_c.SBDF_TIMESPANTYPEID: pl.Duration,
        sbdf_c.SBDF_BINARYTYPEID: pl.Binary,
        sbdf_c.SBDF_DECIMALTYPEID: pl.Decimal,
    }
    vt_id = context.value_type.id
    polars_dtype = dtype_by_id.get(vt_id)
    if polars_dtype is None:
        raise SBDFError(f"unsupported SBDF value type id {vt_id} for Polars output")
    return polars_dtype
|
|
||||||||||||
|
|
||||||||||||
cdef object _import_build_polars_dataframe(column_names, importer_contexts):
    """Build a Polars DataFrame directly from import context data, with no Pandas intermediary.

    :param column_names: list of column name strings
    :param importer_contexts: list of _ImportContext objects
    :return: a Polars DataFrame
    """
    columns = []
    for col_name, ctx in zip(column_names, importer_contexts):
        raw_values = ctx.get_values_array()
        null_mask = ctx.get_invalid_array()
        polars_dtype = _import_polars_dtype(ctx)
        has_nulls = null_mask.any()

        if ctx.is_object_numpy_type():
            # Object arrays hold Python objects (str, date, datetime, etc.); Polars cannot
            # construct a typed series from a numpy object array directly — use a Python list,
            # substituting None at every invalid position.
            data = raw_values.tolist()
            if has_nulls:
                for pos in np.where(null_mask)[0]:
                    data[pos] = None
            series = pl.Series(name=col_name, values=data, dtype=polars_dtype)
        else:
            # Numeric arrays: numpy → Polars Series directly, then scatter nulls if needed.
            series = pl.Series(name=col_name, values=raw_values, dtype=polars_dtype)
            if has_nulls:
                series = series.scatter(np.where(null_mask)[0].tolist(), None)

        columns.append(series)

    return pl.DataFrame(columns)
|
|
||||||||||||
|
|
||||||||||||
| def import_data(sbdf_file, output_format="pandas"): | ||||||||||||
| """Import data from an SBDF file and create a DataFrame. | ||||||||||||
|
|
||||||||||||
| :param sbdf_file: the filename of the SBDF file to import | ||||||||||||
| :param output_format: the format of the returned DataFrame; either 'pandas' (default) or 'polars' | ||||||||||||
| :return: the DataFrame containing the imported data | ||||||||||||
| :raises SBDFError: if a problem is encountered during import | ||||||||||||
| """ | ||||||||||||
| # Validate output_format before opening the file so we fail fast on bad input. | ||||||||||||
| if output_format not in ("pandas", "polars"): | ||||||||||||
| raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'") | ||||||||||||
|
|
||||||||||||
| cdef int error, i | ||||||||||||
| cdef stdio.FILE* input_file = NULL | ||||||||||||
| cdef int major_v, minor_v | ||||||||||||
|
|
@@ -774,7 +865,16 @@ def import_data(sbdf_file): | |||||||||||
| if error != sbdf_c.SBDF_OK and error != sbdf_c.SBDF_TABLEEND: | ||||||||||||
| raise SBDFError(f"error reading '{sbdf_file}': {sbdf_c.sbdf_err_get_str(error).decode('utf-8')}") | ||||||||||||
|
|
||||||||||||
| # Build a new DataFrame with the results | ||||||||||||
| # Short-circuit before pd.concat to avoid the Pandas intermediary entirely. | ||||||||||||
| # This keeps the import zero-copy for large DataFrames: numpy arrays collected | ||||||||||||
| # by each _ImportContext go straight into Polars Series without ever becoming | ||||||||||||
| # a Pandas DataFrame. | ||||||||||||
| if output_format == "polars": | ||||||||||||
| if pl is None: | ||||||||||||
| raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'") | ||||||||||||
| return _import_build_polars_dataframe(column_names, importer_contexts) | ||||||||||||
|
|
||||||||||||
| # Build a new Pandas DataFrame with the results | ||||||||||||
| imported_columns = [] | ||||||||||||
| for i in range(num_columns): | ||||||||||||
| column_series = pd.Series(importer_contexts[i].get_values_array(), | ||||||||||||
|
|
@@ -1030,6 +1130,149 @@ cdef _export_obj_series(obj, default_column_name): | |||||||||||
| return {}, [column_name], [column_metadata], [context] | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description):
    """Determine a value type for a data set based on the Polars dtype for the series.

    :param dtype: the Polars dtype object
    :param series_description: description of series (for error reporting)
    :return: the integer value type id representing the type of series
    :raise SBDFError: if the dtype is unknown
    """
    # Use __class__.__name__ rather than isinstance() checks. Polars dtype objects are
    # not ordinary Python classes resolvable at Cython compile time, so isinstance() would
    # require importing the exact dtype class — which breaks when Polars isn't installed.
    # Class name strings are stable across the Polars versions we support (>= 0.20).
    dtype_name = dtype.__class__.__name__

    # Dtypes that map straight to an SBDF value type with no caveats.
    direct_map = {
        "Boolean": sbdf_c.SBDF_BOOLTYPEID,
        "Int8": sbdf_c.SBDF_INTTYPEID,
        "Int16": sbdf_c.SBDF_INTTYPEID,
        "Int32": sbdf_c.SBDF_INTTYPEID,
        "UInt8": sbdf_c.SBDF_INTTYPEID,
        "UInt16": sbdf_c.SBDF_INTTYPEID,
        "Int64": sbdf_c.SBDF_LONGTYPEID,
        "UInt32": sbdf_c.SBDF_LONGTYPEID,
        "Float32": sbdf_c.SBDF_FLOATTYPEID,
        "Float64": sbdf_c.SBDF_DOUBLETYPEID,
        "Utf8": sbdf_c.SBDF_STRINGTYPEID,
        "String": sbdf_c.SBDF_STRINGTYPEID,
        "Date": sbdf_c.SBDF_DATETYPEID,
        "Duration": sbdf_c.SBDF_TIMESPANTYPEID,
        "Time": sbdf_c.SBDF_TIMETYPEID,
        "Binary": sbdf_c.SBDF_BINARYTYPEID,
        # pl.Series([None, None]) has dtype Null when no type can be inferred. Export as
        # String; _export_polars_series_to_numpy produces a placeholder array and the
        # invalids mask marks every row missing, so the stored values are never read.
        "Null": sbdf_c.SBDF_STRINGTYPEID,
    }
    if dtype_name in direct_map:
        return direct_map[dtype_name]

    # Remaining dtypes are lossy or experimental and emit a warning before mapping.
    if dtype_name == "UInt64":
        warnings.warn(f"Polars UInt64 type in {series_description} will be exported as LongInteger (signed "
                      f"64-bit); values above 9,223,372,036,854,775,807 will overflow", SBDFWarning)
        return sbdf_c.SBDF_LONGTYPEID
    if dtype_name == "Datetime":
        if getattr(dtype, 'time_zone', None) is not None:
            warnings.warn(f"Polars Datetime type in {series_description} has timezone '{dtype.time_zone}'; "
                          f"timezone information will not be preserved in SBDF", SBDFWarning)
        return sbdf_c.SBDF_DATETIMETYPEID
    if dtype_name == "Decimal":
        warnings.warn(f"Polars Decimal type in {series_description} export is experimental; "
                      f"precision may not be fully preserved", SBDFWarning)
        return sbdf_c.SBDF_DECIMALTYPEID
    if dtype_name in ("Categorical", "Enum"):
        # SBDF has no categorical type; export as String
        warnings.warn(f"Polars {dtype_name} type in {series_description} will be exported as String; "
                      f"category information will not be preserved", SBDFWarning)
        return sbdf_c.SBDF_STRINGTYPEID

    raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}")
|
|
||||||||||||
|
|
||||||||||||
| cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series): | ||||||||||||
| """Convert a Polars Series to a NumPy array suitable for the SBDF exporter. | ||||||||||||
|
|
||||||||||||
| :param context: export context holding the resolved value type | ||||||||||||
| :param series: Polars Series to convert | ||||||||||||
| :return: NumPy ndarray of values | ||||||||||||
| """ | ||||||||||||
| dtype_name = series.dtype.__class__.__name__ | ||||||||||||
| if dtype_name == "Null": | ||||||||||||
| # A Null-dtype series has no values to convert; return a same-length placeholder array. | ||||||||||||
| # The invalids mask (set by the caller via series.is_null()) marks every row as missing, | ||||||||||||
| # so the placeholder values are never read by the SBDF writer. | ||||||||||||
| return np.full(len(series), None, dtype=object) | ||||||||||||
| if dtype_name in ("Categorical", "Enum"): | ||||||||||||
| # Cast to String so .to_numpy() returns plain Python strings | ||||||||||||
| series = series.cast(pl.Utf8) | ||||||||||||
| dtype_name = "Utf8" | ||||||||||||
| if dtype_name in ("Date", "Time"): | ||||||||||||
| # The Date/Time exporters require Python date/time objects; | ||||||||||||
| # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept. | ||||||||||||
| return np.asarray(series.to_list(), dtype=object) | ||||||||||||
| if dtype_name in ("Datetime", "Duration"): | ||||||||||||
| # Keep native datetime64/timedelta64 arrays; the invalids mask handles nulls (NaT cells | ||||||||||||
| # are marked invalid and ignored by the SBDF writer). Boxing to object would be slower. | ||||||||||||
| return series.to_numpy(allow_copy=True) | ||||||||||||
| na_value = context.get_numpy_na_value() | ||||||||||||
| if na_value is not None: | ||||||||||||
| return np.asarray(series.fill_null(na_value).to_numpy(allow_copy=True), | ||||||||||||
| dtype=context.get_numpy_dtype()) | ||||||||||||
| else: | ||||||||||||
|
||||||||||||
| else: | |
| else: | |
| # For Datetime/Duration, keep native NumPy datetime64/timedelta64 dtypes instead of boxing to object. | |
| if dtype_name in ("Datetime", "Duration"): | |
| return series.to_numpy(allow_copy=True) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Datetime and Duration series are now converted to NumPy arrays early, before the generic fill-null conversion path.
Copilot
AI
Mar 24, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the Polars export path, invalids are derived from series.is_null(), which does not mark floating-point NaN values as invalid. In the existing Pandas path pd.isnull() treats NaN as missing, so exporting a Polars float column containing NaN will write NaNs as real values instead of SBDF invalids (behavior mismatch vs Pandas and likely incorrect for Spotfire missing-values semantics). Consider treating NaN as invalid for Float32/Float64 columns (e.g., combine is_null() with is_nan() when applicable).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added `series.is_null()` combined with `series.is_nan()` for float columns so that NaN values are treated as missing, matching the Pandas export behavior.
Uh oh!
There was an error while loading. Please reload this page.