Skip to content
Draft
32 changes: 24 additions & 8 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3432,7 +3432,6 @@ def unpivot(
array_value, type="cross"
)
new_passthrough_cols = [column_mapping[col] for col in passthrough_columns]
# Last column is offsets
index_col_ids = [labels_mapping[col] for col in labels_array.column_ids[:-1]]
explode_offsets_id = labels_mapping[labels_array.column_ids[-1]]

Expand All @@ -3442,6 +3441,10 @@ def unpivot(
for input_ids in unpivot_columns:
# row explode offset used to choose the input column
# we use offset instead of label as labels are not necessarily unique
if not input_ids:
unpivot_exprs.append(ex.const(None))
continue

cases = itertools.chain(
*(
(
Expand All @@ -3459,7 +3462,7 @@ def unpivot(
joined_array, unpivot_col_ids = joined_array.compute_values(unpivot_exprs)

return joined_array.select_columns(
[*index_col_ids, *unpivot_col_ids, *new_passthrough_cols]
[*index_col_ids, *unpivot_col_ids, *new_passthrough_cols], allow_renames=True
), (tuple(index_col_ids), tuple(unpivot_col_ids), tuple(new_passthrough_cols))


Expand All @@ -3471,18 +3474,31 @@ def _pd_index_to_array_value(
Create an ArrayValue from a list of label tuples.
The last column will be row offsets.
"""
id_gen = bigframes.core.identifiers.standard_id_strings()
index_ids = [next(id_gen) for _ in range(index.nlevels)]
offset_id = next(id_gen)

rows = []
labels_as_tuples = utils.index_as_tuples(index)
for row_offset in range(len(index)):
id_gen = bigframes.core.identifiers.standard_id_strings()
row_label = labels_as_tuples[row_offset]
row_label = (row_label,) if not isinstance(row_label, tuple) else row_label
row = {}
for label_part, id in zip(row_label, id_gen):
row[id] = label_part if pd.notnull(label_part) else None
row[next(id_gen)] = row_offset
row = {
id: (val if pd.notnull(val) else None)
for id, val in zip(index_ids, row_label)
}
row[offset_id] = row_offset
rows.append(row)

if not rows:
# Create empty table with correct columns
schema = pa.schema(
[pa.field(id, pa.null()) for id in index_ids]
+ [pa.field(offset_id, pa.int64())]
)
return core.ArrayValue.from_pyarrow(
pa.Table.from_batches([], schema=schema), session=session
)

return core.ArrayValue.from_pyarrow(pa.Table.from_pylist(rows), session=session)


Expand Down
33 changes: 21 additions & 12 deletions bigframes/pandas/core/methods/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import typing

import pandas as pd
import pyarrow as pa

from bigframes import dataframe, dtypes, series
from bigframes.core import agg_expressions, blocks
Expand Down Expand Up @@ -86,9 +87,13 @@ def _describe(
if include != "all" and dtype not in _DEFAULT_DTYPES:
continue
agg_ops = _get_aggs_for_dtype(dtype)
stats.extend(op.as_expr(col_id) for op in agg_ops)
label_tuple = (label,) if block.column_labels.nlevels == 1 else label
column_labels.extend((*label_tuple, op.name) for op in agg_ops) # type: ignore

label_tuple = (
(label,) if block.column_labels.nlevels == 1 else typing.cast(tuple, label)
)
for op in agg_ops:
stats.append(op.as_expr(col_id))
column_labels.append((*label_tuple, op.name))

agg_block = block.aggregate(
by_column_ids=by_col_ids,
Expand All @@ -100,7 +105,7 @@ def _describe(


def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
if dtypes.is_numeric(dtype, include_bool=False):
return [
aggregations.count_op,
aggregations.mean_op,
Expand All @@ -111,14 +116,18 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
aggregations.ApproxQuartilesOp(3),
aggregations.max_op,
]
elif dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
elif dtypes.is_datetime_like(dtype) or dtypes.is_date_like(dtype):
return [aggregations.count_op]
elif dtype in [
dtypes.STRING_DTYPE,
dtypes.BOOL_DTYPE,
dtypes.BYTES_DTYPE,
dtypes.TIME_DTYPE,
]:
elif (
dtypes.is_string_like(dtype)
or dtypes.is_binary_like(dtype)
or dtypes.is_time_like(dtype)
or (
isinstance(dtype, pd.ArrowDtype)
and pa.types.is_struct(dtype.pyarrow_dtype)
and not dtypes.contains_db_dtypes_json_dtype(dtype)
)
):
return [aggregations.count_op, aggregations.nunique_op]
else:
return []
return [aggregations.count_op]
79 changes: 79 additions & 0 deletions tests/unit/core/test_blocks_unpivot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from unittest import mock

import pandas as pd
import pyarrow as pa
import pytest

from bigframes.core import blocks


@pytest.fixture
def mock_session():
    """A MagicMock stand-in for a BigFrames Session with no BigQuery client attached."""
    fake_session = mock.MagicMock()
    fake_session.bqclient = None
    return fake_session


def test_pd_index_to_array_value_with_empty_index_creates_no_columns(mock_session):
    """
    Tests that `_pd_index_to_array_value` with an empty pandas Index creates
    an ArrayValue with the expected number of columns (index level + offset).
    """
    result = blocks._pd_index_to_array_value(
        mock_session, pd.Index([], name="test")
    )

    # Expect one column per index level plus the trailing offsets column.
    assert len(result.column_ids) == 1 + 1


def test_pd_index_to_array_value_with_empty_multiindex_creates_no_columns(mock_session):
    """
    Tests that `_pd_index_to_array_value` with an empty pandas MultiIndex creates
    an ArrayValue with the expected number of columns (index levels + offset).
    """
    multi = pd.MultiIndex.from_arrays([[], []], names=["a", "b"])

    result = blocks._pd_index_to_array_value(mock_session, multi)

    # Expect one column per MultiIndex level plus the trailing offsets column.
    assert len(result.column_ids) == 2 + 1


def test_unpivot_with_empty_row_labels(mock_session):
    """
    Tests that `unpivot` accepts an empty `row_labels` index and returns a new
    ArrayValue whose index, value, and passthrough column ids are all reported
    and present in the result.
    """
    # Build a small input ArrayValue from an in-memory pyarrow table.
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    pa_table = pa.Table.from_pandas(df)
    array_value = blocks.core.ArrayValue.from_pyarrow(pa_table, session=mock_session)

    # Unpivot against an empty pd.Index of row labels.
    unpivot_result, (index_cols, value_cols, passthrough_cols) = blocks.unpivot(
        array_value,
        row_labels=pd.Index([]),
        unpivot_columns=[("a",)],
        passthrough_columns=["b"],
    )

    # A new ArrayValue is produced; the input is not mutated in place.
    assert unpivot_result is not array_value
    # One generated index column, one value column, and the passthrough column.
    assert index_cols == ("col_0",)
    assert len(value_cols) == 1
    assert passthrough_cols == ("b",)
    # Every reported column id must actually exist in the resulting ArrayValue.
    reported = {*index_cols, *value_cols, *passthrough_cols}
    assert reported <= set(unpivot_result.column_ids)
Loading