This repository was archived by the owner on May 19, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
Use public cudf/pylibcudf APIs for cudf backend #121
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
02c449e
Use public cudf/pylibcudf APIs for cudf backend
mroeschke 8b92cc6
Merge branch 'main' into ref/cudf/public
VibhuJawa 9aad492
Merge remote-tracking branch 'upstream/main' into ref/cudf/public
mroeschke 62af4c2
Use pylibcudf from 3D case
mroeschke 33d2759
Merge branch 'ref/cudf/public' of https://github.com/mroeschke/crossf…
mroeschke a2ccf54
Merge remote-tracking branch 'upstream/main' into ref/cudf/public
mroeschke ac4b8c3
Remove all conditional code on less than 25.06
mroeschke File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -12,136 +12,34 @@ | |||||||||
| # See the License for the specific language governing permissions and | ||||||||||
| # limitations under the License. | ||||||||||
|
|
||||||||||
| from functools import lru_cache | ||||||||||
| from typing import TYPE_CHECKING, Any, Optional | ||||||||||
|
|
||||||||||
| import cudf | ||||||||||
| import cupy as cp | ||||||||||
| import numpy as np | ||||||||||
| from cudf.core.column import as_column | ||||||||||
| from cudf.core.dtypes import ListDtype | ||||||||||
| from packaging.version import parse as parse_version | ||||||||||
|
|
||||||||||
| if TYPE_CHECKING: | ||||||||||
| from cudf.core.buffer import Buffer | ||||||||||
| from cudf.core.column import ColumnBase | ||||||||||
| from cudf.core.column.numerical import NumericalColumn | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @lru_cache | ||||||||||
| def _is_cudf_gte_24_10(): | ||||||||||
| current_cudf_version = parse_version(cudf.__version__) | ||||||||||
| cudf_24_10_version = parse_version("24.10.0") | ||||||||||
|
|
||||||||||
| if current_cudf_version >= cudf_24_10_version or ( | ||||||||||
| current_cudf_version.base_version >= "24.10.0" and current_cudf_version.is_prerelease | ||||||||||
| ): | ||||||||||
| return True | ||||||||||
| elif current_cudf_version < cudf_24_10_version or ( | ||||||||||
| current_cudf_version.base_version < "24.10.0" and current_cudf_version.is_prerelease | ||||||||||
| ): | ||||||||||
| return False | ||||||||||
| else: | ||||||||||
| msg = f"Found uncaught cudf version {current_cudf_version}" | ||||||||||
| raise NotImplementedError(msg) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def _construct_series_from_list_column(index: Any, lc: cudf.core.column.ListColumn) -> cudf.Series: | ||||||||||
| if not _is_cudf_gte_24_10(): | ||||||||||
| return cudf.Series(data=lc, index=index) | ||||||||||
| else: | ||||||||||
| from cudf.core.index import ensure_index | ||||||||||
|
|
||||||||||
| return cudf.Series._from_column(column=lc, index=ensure_index(index)) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def _construct_list_column( | ||||||||||
| size: int, | ||||||||||
| dtype: ListDtype, | ||||||||||
| mask: Optional["Buffer"] = None, | ||||||||||
| offset: int = 0, | ||||||||||
| null_count: Optional[int] = None, | ||||||||||
| children: tuple["NumericalColumn", "ColumnBase"] = (), # type: ignore[assignment] | ||||||||||
| ) -> cudf.core.column.ListColumn: | ||||||||||
| kwargs = dict( | ||||||||||
| size=size, | ||||||||||
| dtype=dtype, | ||||||||||
| mask=mask, | ||||||||||
| offset=offset, | ||||||||||
| null_count=null_count, | ||||||||||
| children=children, | ||||||||||
| ) | ||||||||||
|
|
||||||||||
| if not _is_cudf_gte_24_10(): | ||||||||||
| return cudf.core.column.ListColumn(**kwargs) | ||||||||||
| else: | ||||||||||
| # in 24.10 ListColumn added `data` kwarg see https://github.com/rapidsai/crossfit/issues/84 | ||||||||||
| return cudf.core.column.ListColumn(data=None, **kwargs) | ||||||||||
| import pylibcudf as plc | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def create_list_series_from_1d_or_2d_ar(ar, index): | ||||||||||
| """ | ||||||||||
| Create a cudf list series from 2d arrays | ||||||||||
| """ | ||||||||||
| if len(ar.shape) == 1: | ||||||||||
| n_rows, *_ = ar.shape | ||||||||||
| n_cols = 1 | ||||||||||
| elif len(ar.shape) == 2: | ||||||||||
| n_rows, n_cols = ar.shape | ||||||||||
| else: | ||||||||||
| return RuntimeError(f"Unexpected input shape: {ar.shape}") | ||||||||||
| data = as_column(ar.flatten()) | ||||||||||
| offset_col = as_column( | ||||||||||
| cp.arange(start=0, stop=len(data) + 1, step=n_cols), dtype=np.dtype("int32") | ||||||||||
| ) | ||||||||||
| mask = cudf.Series(cp.full(shape=n_rows, fill_value=cp.bool_(True)))._column.as_mask() | ||||||||||
|
|
||||||||||
| lc = _construct_list_column( | ||||||||||
| size=n_rows, | ||||||||||
| dtype=cudf.ListDtype(data.dtype), | ||||||||||
| mask=mask, | ||||||||||
| offset=0, | ||||||||||
| null_count=0, | ||||||||||
| children=(offset_col, data), | ||||||||||
| arr = cp.asarray(ar) | ||||||||||
| if len(arr.shape) == 1: | ||||||||||
| arr = arr.reshape(-1, 1) | ||||||||||
| if not isinstance(index, (cudf.RangeIndex, cudf.Index, cudf.MultiIndex)): | ||||||||||
| index = cudf.Index(index) | ||||||||||
| return cudf.Series.from_pylibcudf( | ||||||||||
| plc.Column.from_cuda_array_interface(arr), | ||||||||||
| metadata={"index": index}, | ||||||||||
| ) | ||||||||||
| return _construct_series_from_list_column(lc=lc, index=index) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def create_nested_list_series_from_3d_ar(ar, index): | ||||||||||
| """ | ||||||||||
| Create a cudf list of lists series from 3d arrays | ||||||||||
| """ | ||||||||||
| n_slices, n_rows, n_cols = ar.shape | ||||||||||
| flattened_data = ar.reshape(-1) # Flatten the 3-D array into 1-D | ||||||||||
|
|
||||||||||
| # Inner list offsets (for each row in 2D slices) | ||||||||||
| inner_offsets = cp.arange( | ||||||||||
| start=0, stop=n_cols * n_rows * n_slices + 1, step=n_cols, dtype="int32" | ||||||||||
| ) | ||||||||||
| inner_list_data = as_column(flattened_data) | ||||||||||
| inner_list_offsets = as_column(inner_offsets) | ||||||||||
|
|
||||||||||
| # Outer list offsets (for each 2D slice in the 3D array) | ||||||||||
| outer_offsets = cp.arange(start=0, stop=n_slices + 1, step=1, dtype="int32") * n_rows | ||||||||||
| outer_list_offsets = as_column(outer_offsets) | ||||||||||
|
|
||||||||||
| # Constructing the nested ListColumn | ||||||||||
| inner_lc = _construct_list_column( | ||||||||||
| size=inner_offsets.size - 1, | ||||||||||
| dtype=cudf.ListDtype(inner_list_data.dtype), | ||||||||||
| children=(inner_list_offsets, inner_list_data), | ||||||||||
| mask=None, | ||||||||||
| offset=0, | ||||||||||
| null_count=None, | ||||||||||
| arr = cp.asarray(ar) | ||||||||||
|
||||||||||
| arr = cp.asarray(ar) | |
| arr = cp.asarray(ar) | |
| if arr.ndim != 3: | |
| raise ValueError(f"Input array must be 3D, but got {arr.ndim}D array instead.") |
Member
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ignoring the Copilot comments, as the plc (pylibcudf) code should just handle other dimensions too, I think.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The function doesn’t validate inputs with more than 2 dimensions, leading to unexpected behavior for
`ndim > 2`. Add an explicit `elif arr.ndim > 2: raise ValueError(...)` to guard against unsupported shapes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ignoring the Copilot comments, as the plc code should just handle other dimensions too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yup, confirming the pylibcudf API can handle an arbitrary n-dimensional ndarray.