Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ __pycache__/

# virtual environments
/venv/
/.venv/

# uv lock file (this is a library; lock files are for applications)
/uv.lock

# Claude Code
/.claude

# PyCharm project files
/.idea
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,16 @@ simply `spotfire`) to include the required Python packages to support optional f
| `spotfire[plot-matplotlib]` | Plotting support using just `matplotlib` |
| `spotfire[plot-pil]` | Plotting support using just `Pillow` |
| `spotfire[plot-seaborn]` | Plotting support using just `seaborn` |
| `spotfire[polars]` | Polars DataFrame support |
| `spotfire[dev,lint]` | Internal development |

Once installed, `export_data()` accepts `polars.DataFrame` and `polars.Series` directly, and
`import_data()` can return a `polars.DataFrame` via `output_format="polars"`.

> **Note for Spotfire data functions:** Spotfire's bundled Python interpreter does not include
> Polars. To use Polars inside a data function, configure Spotfire to use a custom Python
> environment that has `polars` installed. Polars is a large binary package (~44 MB), so
> Spotfire Packages (SPKs) that bundle it will be significantly larger than typical packages.

### License
BSD-type 3-Clause License. See the file ```LICENSE``` included in the package.
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,13 @@ plot-seaborn = [
"seaborn >= 0.13.2",
]
plot = [ "spotfire[plot-matplotlib,plot-pil,plot-seaborn]" ]
# Polars support
polars = [
"polars >= 0.20.0",
]
# Development requirements
dev = [
"spotfire[geo,plot]",
"spotfire[geo,plot,polars]",
"Cython >= 3.0.4",
"html-testRunner",
]
Expand Down
6 changes: 5 additions & 1 deletion spotfire/sbdf.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ class SBDFError(Exception): ...
class SBDFWarning(Warning): ...

def spotfire_typename_to_valuetype_id(typename: str) -> typing.Optional[int]: ...
@typing.overload
def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["pandas"] = ...) -> pd.DataFrame: ...
@typing.overload
def import_data(sbdf_file: _FilenameLike, output_format: typing.Literal["polars"]) -> typing.Any: ...
@typing.overload
def import_data(sbdf_file: _FilenameLike, output_format: str) -> typing.Any: ...
def export_data(obj: typing.Any, sbdf_file: _FilenameLike, default_column_name: str = "x",
rows_per_slice: int = 0, encoding_rle: bool = True) -> None: ...
257 changes: 253 additions & 4 deletions spotfire/sbdf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ try:
except ImportError:
PIL = None

try:
import polars as pl
except ImportError:
pl = None


# Various utility helper functions for doing things that are problematic in PYX files
include "sbdf_helpers.pxi"
Expand Down Expand Up @@ -420,6 +425,18 @@ cdef class _ImportContext:
"""
return _valuetype_id_to_spotfire_typename(self.value_type.id)

    cpdef bint is_object_numpy_type(self):
        """Return True if the numpy type for this column is NPY_OBJECT.

        Object-typed columns collect Python objects (strings, dates, etc.) rather than
        unboxed numeric values, so callers must build output series from a Python list
        instead of converting the numpy array directly.

        :return: True if the numpy type is object, False otherwise

        .. note:: ``numpy_type_num`` is a ``cdef`` attribute and is therefore inaccessible from
            Python-side ``cdef object`` functions. This ``cpdef`` wrapper exposes it so that
            :func:`_import_build_polars_dataframe` can branch on it without touching the
            Cython-only attribute directly.
        """
        return self.numpy_type_num == np_c.NPY_OBJECT


# Individual functions for importing each value type.
ctypedef int(*importer_fn)(_ImportContext, sbdf_c.sbdf_columnslice*)
Expand Down Expand Up @@ -654,13 +671,87 @@ cdef dict _import_metadata(sbdf_c.sbdf_metadata_head* md, int column_num):
return metadata


def import_data(sbdf_file):
"""Import data from an SBDF file and create a 'pandas' DataFrame.
cdef object _import_polars_dtype(_ImportContext context):
    """Return the Polars dtype corresponding to the SBDF value type in the import context.

    :param context: import context for a column
    :return: the Polars dtype object
    :raises SBDFError: if the SBDF value type has no Polars equivalent
    """
    # Built per call rather than at module scope so that importing this module never
    # touches ``pl`` when Polars is not installed; this function is only reached after
    # the caller has verified Polars is available.
    dtype_by_valuetype = {
        sbdf_c.SBDF_BOOLTYPEID: pl.Boolean,
        sbdf_c.SBDF_INTTYPEID: pl.Int32,
        sbdf_c.SBDF_LONGTYPEID: pl.Int64,
        sbdf_c.SBDF_FLOATTYPEID: pl.Float32,
        sbdf_c.SBDF_DOUBLETYPEID: pl.Float64,
        sbdf_c.SBDF_STRINGTYPEID: pl.Utf8,
        sbdf_c.SBDF_DATETIMETYPEID: pl.Datetime,
        sbdf_c.SBDF_DATETYPEID: pl.Date,
        sbdf_c.SBDF_TIMETYPEID: pl.Time,
        sbdf_c.SBDF_TIMESPANTYPEID: pl.Duration,
        sbdf_c.SBDF_BINARYTYPEID: pl.Binary,
        sbdf_c.SBDF_DECIMALTYPEID: pl.Decimal,
    }
    vt_id = context.value_type.id
    polars_dtype = dtype_by_valuetype.get(vt_id)
    if polars_dtype is None:
        raise SBDFError(f"unsupported SBDF value type id {vt_id} for Polars output")
    return polars_dtype


cdef object _import_build_polars_dataframe(column_names, importer_contexts):
    """Build a Polars DataFrame directly from import context data, with no Pandas intermediary.

    :param column_names: list of column name strings
    :param importer_contexts: list of _ImportContext objects
    :return: a Polars DataFrame
    """
    columns = []
    for name, context in zip(column_names, importer_contexts):
        raw_values = context.get_values_array()
        missing = context.get_invalid_array()
        dtype = _import_polars_dtype(context)

        if context.is_object_numpy_type():
            # Object arrays hold Python objects (str, date, datetime, etc.); Polars cannot
            # construct a typed series from a numpy object array directly — use a Python list.
            data = raw_values.tolist()
            if missing.any():
                for pos in np.where(missing)[0]:
                    data[pos] = None
            series = pl.Series(name=name, values=data, dtype=dtype)
        else:
            # Numeric arrays: numpy → Polars Series directly, then scatter nulls if needed.
            series = pl.Series(name=name, values=raw_values, dtype=dtype)
            if missing.any():
                series = series.scatter(np.where(missing)[0].tolist(), None)

        columns.append(series)

    return pl.DataFrame(columns)


def import_data(sbdf_file, output_format="pandas"):
"""Import data from an SBDF file and create a DataFrame.

:param sbdf_file: the filename of the SBDF file to import
:param output_format: the format of the returned DataFrame; either 'pandas' (default) or 'polars'
:return: the DataFrame containing the imported data
:raises SBDFError: if a problem is encountered during import
"""
# Validate output_format before opening the file so we fail fast on bad input.
if output_format not in ("pandas", "polars"):
raise SBDFError(f"unknown output_format {output_format!r}; expected 'pandas' or 'polars'")

cdef int error, i
cdef stdio.FILE* input_file = NULL
cdef int major_v, minor_v
Expand Down Expand Up @@ -774,7 +865,16 @@ def import_data(sbdf_file):
if error != sbdf_c.SBDF_OK and error != sbdf_c.SBDF_TABLEEND:
raise SBDFError(f"error reading '{sbdf_file}': {sbdf_c.sbdf_err_get_str(error).decode('utf-8')}")

# Build a new DataFrame with the results
# Short-circuit before pd.concat to avoid the Pandas intermediary entirely.
# This keeps the import zero-copy for large DataFrames: numpy arrays collected
# by each _ImportContext go straight into Polars Series without ever becoming
# a Pandas DataFrame.
if output_format == "polars":
if pl is None:
raise SBDFError("polars is not installed; install it with 'pip install spotfire[polars]'")
return _import_build_polars_dataframe(column_names, importer_contexts)

# Build a new Pandas DataFrame with the results
imported_columns = []
for i in range(num_columns):
column_series = pd.Series(importer_contexts[i].get_values_array(),
Expand Down Expand Up @@ -1030,6 +1130,149 @@ cdef _export_obj_series(obj, default_column_name):
return {}, [column_name], [column_metadata], [context]


cdef int _export_infer_valuetype_from_polars_dtype(dtype, series_description):
    """Determine a value type for a data set based on the Polars dtype for the series.

    :param dtype: the Polars dtype object
    :param series_description: description of series (for error reporting)
    :return: the integer value type id representing the type of series
    :raise SBDFError: if the dtype is unknown
    """
    # Match on __class__.__name__ rather than isinstance() checks. Polars dtype objects are
    # not ordinary Python classes resolvable at Cython compile time, so isinstance() would
    # require importing the exact dtype class — which breaks when Polars isn't installed.
    # Class name strings are stable across the Polars versions we support (>= 0.20).
    dtype_name = dtype.__class__.__name__

    # One-to-one mappings with no caveats to warn about.  Null-dtype series (e.g.
    # pl.Series([None, None]), where no type can be inferred) export as String:
    # _export_polars_series_to_numpy produces a placeholder array and the invalids
    # mask marks every row missing, so the stored values are never read.
    direct = {
        "Boolean": sbdf_c.SBDF_BOOLTYPEID,
        "Int8": sbdf_c.SBDF_INTTYPEID,
        "Int16": sbdf_c.SBDF_INTTYPEID,
        "Int32": sbdf_c.SBDF_INTTYPEID,
        "UInt8": sbdf_c.SBDF_INTTYPEID,
        "UInt16": sbdf_c.SBDF_INTTYPEID,
        "Int64": sbdf_c.SBDF_LONGTYPEID,
        "UInt32": sbdf_c.SBDF_LONGTYPEID,
        "Float32": sbdf_c.SBDF_FLOATTYPEID,
        "Float64": sbdf_c.SBDF_DOUBLETYPEID,
        "Utf8": sbdf_c.SBDF_STRINGTYPEID,
        "String": sbdf_c.SBDF_STRINGTYPEID,
        "Date": sbdf_c.SBDF_DATETYPEID,
        "Duration": sbdf_c.SBDF_TIMESPANTYPEID,
        "Time": sbdf_c.SBDF_TIMETYPEID,
        "Binary": sbdf_c.SBDF_BINARYTYPEID,
        "Null": sbdf_c.SBDF_STRINGTYPEID,
    }
    if dtype_name in direct:
        return direct[dtype_name]

    # Remaining dtypes export with a caveat the user should hear about.
    if dtype_name == "UInt64":
        warnings.warn(f"Polars UInt64 type in {series_description} will be exported as LongInteger (signed "
                      f"64-bit); values above 9,223,372,036,854,775,807 will overflow", SBDFWarning)
        return sbdf_c.SBDF_LONGTYPEID
    if dtype_name == "Datetime":
        if getattr(dtype, 'time_zone', None) is not None:
            warnings.warn(f"Polars Datetime type in {series_description} has timezone '{dtype.time_zone}'; "
                          f"timezone information will not be preserved in SBDF", SBDFWarning)
        return sbdf_c.SBDF_DATETIMETYPEID
    if dtype_name == "Decimal":
        warnings.warn(f"Polars Decimal type in {series_description} export is experimental; "
                      f"precision may not be fully preserved", SBDFWarning)
        return sbdf_c.SBDF_DECIMALTYPEID
    if dtype_name in ("Categorical", "Enum"):
        # SBDF has no categorical type; export as String
        warnings.warn(f"Polars {dtype_name} type in {series_description} will be exported as String; "
                      f"category information will not be preserved", SBDFWarning)
        return sbdf_c.SBDF_STRINGTYPEID

    raise SBDFError(f"unknown Polars dtype '{dtype_name}' in {series_description}")


cdef np_c.ndarray _export_polars_series_to_numpy(_ExportContext context, series):
    """Convert a Polars Series to a NumPy array suitable for the SBDF exporter.

    :param context: export context holding the resolved value type
    :param series: Polars Series to convert
    :return: NumPy ndarray of values
    """
    type_name = series.dtype.__class__.__name__

    if type_name == "Null":
        # A Null-dtype series has no values to convert; return a same-length placeholder array.
        # The invalids mask (set by the caller via series.is_null()) marks every row as missing,
        # so the placeholder values are never read by the SBDF writer.
        return np.full(len(series), None, dtype=object)

    if type_name in ("Categorical", "Enum"):
        # Cast to String so .to_numpy() returns plain Python strings
        series = series.cast(pl.Utf8)
        type_name = "Utf8"

    if type_name in ("Date", "Time"):
        # The Date/Time exporters require Python date/time objects;
        # Polars .to_numpy() returns numpy datetime64/int64 which those exporters do not accept.
        return np.asarray(series.to_list(), dtype=object)

    if type_name in ("Datetime", "Duration"):
        # Keep native datetime64/timedelta64 arrays; the invalids mask handles nulls (NaT cells
        # are marked invalid and ignored by the SBDF writer). Boxing to object would be slower.
        return series.to_numpy(allow_copy=True)

    fill = context.get_numpy_na_value()
    if fill is None:
        # No well-defined NA fill value for this value type: fall back to an object array.
        return np.asarray(series.to_numpy(allow_copy=True), dtype=object)
    return np.asarray(series.fill_null(fill).to_numpy(allow_copy=True),
                      dtype=context.get_numpy_dtype())


cdef _export_obj_polars_dataframe(obj):
    """Extract column information for a Polars ``DataFrame``.

    :param obj: Polars DataFrame object to export
    :return: tuple containing dictionary of table metadata, list of column names, list of dictionaries of column
        metadata, and list of export context objects
    """
    names = list(obj.columns)
    if len(set(names)) != len(names):
        raise SBDFError("obj does not have unique column names")

    contexts = []
    for col in names:
        series = obj[col]
        ctx = _ExportContext()
        ctx.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(series.dtype, f"column '{col}'"))
        # Match the Pandas path's missing-value semantics: NaN in float columns is
        # treated as invalid (missing), not exported as a real value.
        if series.dtype.__class__.__name__ in ("Float32", "Float64"):
            missing = (series.is_null() | series.is_nan()).to_numpy()
        else:
            missing = series.is_null().to_numpy()
        ctx.set_arrays(_export_polars_series_to_numpy(ctx, series), missing)
        contexts.append(ctx)

    return {}, names, [{} for _ in names], contexts


cdef _export_obj_polars_series(obj, default_column_name):
    """Extract column information for a Polars ``Series``.

    :param obj: Polars Series object to export
    :param default_column_name: column name to use when obj does not have a name
    :return: tuple containing dict of table metadata, list of column names, list of dicts of column metadata, and
        list of export context objects
    """
    if obj.name:
        name, description = obj.name, f"series '{obj.name}'"
    else:
        name, description = default_column_name, "series"

    context = _ExportContext()
    context.set_valuetype_id(_export_infer_valuetype_from_polars_dtype(obj.dtype, description))
    # Match the Pandas path's missing-value semantics: NaN in float series is
    # treated as invalid (missing), not exported as a real value.
    is_float = obj.dtype.__class__.__name__ in ("Float32", "Float64")
    null_mask = (obj.is_null() | obj.is_nan()) if is_float else obj.is_null()
    context.set_arrays(_export_polars_series_to_numpy(context, obj), null_mask.to_numpy())

    return {}, [name], [{}], [context]


cdef _export_obj_numpy(np_c.ndarray obj, default_column_name):
"""Extract column information for a NumPy ``ndarray``.

Expand Down Expand Up @@ -1801,8 +2044,14 @@ def export_data(obj, sbdf_file, default_column_name="x", Py_ssize_t rows_per_sli

try:
# Extract data and metadata from obj
# Polars DataFrames (tabular)
if pl is not None and isinstance(obj, pl.DataFrame):
exported = _export_obj_polars_dataframe(obj)
# Polars Series (columnar)
elif pl is not None and isinstance(obj, pl.Series):
exported = _export_obj_polars_series(obj, default_column_name)
# Pandas DataFrames (tabular)
if isinstance(obj, pd.DataFrame):
elif isinstance(obj, pd.DataFrame):
exported = _export_obj_dataframe(obj)
# Pandas Series (columnar)
elif isinstance(obj, pd.Series):
Expand Down
Loading