diff --git a/bigframes/core/logging/__init__.py b/bigframes/core/logging/__init__.py index 95c077a99a..5d06124efc 100644 --- a/bigframes/core/logging/__init__.py +++ b/bigframes/core/logging/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from bigframes.core.logging import log_adapter +from bigframes.core.logging import data_types, log_adapter -__all__ = ["log_adapter"] +__all__ = ["log_adapter", "data_types"] diff --git a/bigframes/core/logging/data_types.py b/bigframes/core/logging/data_types.py new file mode 100644 index 0000000000..db99b1a020 --- /dev/null +++ b/bigframes/core/logging/data_types.py @@ -0,0 +1,65 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from bigframes import dtypes

# Bit positions for dtypes that are recognised by a plain equality check.
# Entries are tried in order with ``==``. Bit 0 is reserved for unknown dtypes
# and bits 15-17 are assigned by structural checks in ``_get_dtype_mask``.
_SCALAR_DTYPE_BITS = (
    (dtypes.INT_DTYPE, 1),
    (dtypes.FLOAT_DTYPE, 2),
    (dtypes.BOOL_DTYPE, 3),
    (dtypes.STRING_DTYPE, 4),
    (dtypes.BYTES_DTYPE, 5),
    (dtypes.DATE_DTYPE, 6),
    (dtypes.TIME_DTYPE, 7),
    (dtypes.DATETIME_DTYPE, 8),
    (dtypes.TIMESTAMP_DTYPE, 9),
    (dtypes.TIMEDELTA_DTYPE, 10),
    (dtypes.NUMERIC_DTYPE, 11),
    (dtypes.BIGNUMERIC_DTYPE, 12),
    (dtypes.GEO_DTYPE, 13),
    (dtypes.JSON_DTYPE, 14),
)

_UNKNOWN_MASK = 1 << 0
_STRUCT_MASK = 1 << 15
_ARRAY_MASK = 1 << 16
_OBJ_REF_MASK = 1 << 17


def _add_data_type(existing_types: int, curr_type: dtypes.Dtype) -> int:
    """Return ``existing_types`` with the bit(s) for ``curr_type`` switched on.

    Args:
        existing_types: Bitmask of the data types recorded so far.
        curr_type: The data type to record.

    Returns:
        The bitwise OR of ``existing_types`` and the mask of ``curr_type``.
        Adding a type whose bits are already set returns the same value.
    """
    type_mask = _get_dtype_mask(curr_type)
    return existing_types | type_mask


def _get_dtype_mask(dtype: dtypes.Dtype) -> int:
    """Map a data type to its bitmask.

    Bit layout of the returned value:

    * bit 0: unknown data type
    * bits 1-14: scalar types, INT through JSON
    * bit 15: struct-like types
    * bit 16: array-like types
    * bit 17: the object-reference struct, set together with bit 15

    Args:
        dtype: The data type to classify.

    Returns:
        An integer with one bit set, or two bits set for the object-reference
        struct.
    """
    for known_dtype, bit in _SCALAR_DTYPE_BITS:
        if dtype == known_dtype:
            return 1 << bit

    if dtypes.is_struct_like(dtype):
        if dtype == dtypes.OBJ_REF_DTYPE:
            # obj_ref is a special struct type for multi-modal data.
            # It is counted both as a "struct" and as its own type.
            return _STRUCT_MASK | _OBJ_REF_MASK
        return _STRUCT_MASK

    if dtypes.is_array_like(dtype):
        return _ARRAY_MASK

    # If an unknown data type is present, mark it with the least significant bit.
    return _UNKNOWN_MASK


# diff --git a/tests/unit/core/logging/test_data_types.py b/tests/unit/core/logging/test_data_types.py
# new file mode 100644
# index 0000000000..9e3d1f1ed0
# --- /dev/null
# +++ b/tests/unit/core/logging/test_data_types.py
# @@ -0,0 +1,69 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import pyarrow as pa
import pytest

from bigframes import dtypes
from bigframes.core.logging import data_types

# A dtype that the mask lookup is expected to report as "unknown" (bit 0).
UNKNOWN_TYPE = pd.ArrowDtype(pa.time64("ns"))

PA_STRUCT_TYPE = pa.struct([("city", pa.string()), ("pop", pa.int64())])

PA_LIST_TYPE = pa.list_(pa.int64())

# (dtype, expected bitmask) pairs, one per bit the mask can report.
_EXPECTED_MASKS = [
    (UNKNOWN_TYPE, 1 << 0),
    (dtypes.INT_DTYPE, 1 << 1),
    (dtypes.FLOAT_DTYPE, 1 << 2),
    (dtypes.BOOL_DTYPE, 1 << 3),
    (dtypes.STRING_DTYPE, 1 << 4),
    (dtypes.BYTES_DTYPE, 1 << 5),
    (dtypes.DATE_DTYPE, 1 << 6),
    (dtypes.TIME_DTYPE, 1 << 7),
    (dtypes.DATETIME_DTYPE, 1 << 8),
    (dtypes.TIMESTAMP_DTYPE, 1 << 9),
    (dtypes.TIMEDELTA_DTYPE, 1 << 10),
    (dtypes.NUMERIC_DTYPE, 1 << 11),
    (dtypes.BIGNUMERIC_DTYPE, 1 << 12),
    (dtypes.GEO_DTYPE, 1 << 13),
    (dtypes.JSON_DTYPE, 1 << 14),
    (pd.ArrowDtype(PA_STRUCT_TYPE), 1 << 15),
    (pd.ArrowDtype(PA_LIST_TYPE), 1 << 16),
    # The object-reference struct sets its own bit on top of the struct bit.
    (dtypes.OBJ_REF_DTYPE, (1 << 15) | (1 << 17)),
]


@pytest.mark.parametrize("dtype, expected_mask", _EXPECTED_MASKS)
def test_get_dtype_mask(dtype, expected_mask):
    """Each dtype maps to its expected bit pattern."""
    actual_mask = data_types._get_dtype_mask(dtype)

    assert actual_mask == expected_mask


def test_add_data_type__type_overlap_no_op():
    """Adding a type that is already recorded leaves the mask unchanged."""
    string_mask = data_types._get_dtype_mask(dtypes.STRING_DTYPE)

    merged = data_types._add_data_type(string_mask, dtypes.STRING_DTYPE)

    assert merged == string_mask


def test_add_data_type__new_type_updated():
    """Adding a type that is not yet recorded ORs its bit into the mask."""
    int_mask = data_types._get_dtype_mask(dtypes.INT_DTYPE)
    string_mask = data_types._get_dtype_mask(dtypes.STRING_DTYPE)

    merged = data_types._add_data_type(int_mask, dtypes.STRING_DTYPE)

    assert merged == int_mask | string_mask