From 625c2abb42781a02ff111f528e379dcdfb016ee0 Mon Sep 17 00:00:00 2001
From: Peter Nguyen
Date: Sun, 13 Jul 2025 23:48:58 -0700
Subject: [PATCH 1/3] Check first non-null element for infer_pd_series_spark_type

---
 .../pandas/tests/data_type_ops/test_udt_ops.py | 13 +++++++++++++
 python/pyspark/pandas/typedef/typehints.py     |  5 +++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
index 60b4153198a34..f4e6e8f7d644a 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
@@ -130,6 +130,19 @@ def test_from_to_pandas(self):
         self.assert_eq(pser, psser._to_pandas())
         self.assert_eq(ps.from_pandas(pser), psser)
 
+    def test_with_first_null(self):
+        lst = [None, None, None, SparseVector(1, {0: 0.1})]
+        pser = pd.Series(lst)
+        psser = ps.Series(lst)
+        self.assert_eq(pser, psser._to_pandas())
+        self.assert_eq(ps.from_pandas(pser), psser)
+
+        lst2 = [SparseVector(1, {0: 0.1}), None, None, None]
+        pdf = pd.DataFrame({"a": lst, "b": lst2})
+        psdf = ps.DataFrame({"a": lst, "b": lst2})
+        self.assert_eq(pdf, psdf._to_pandas())
+        self.assert_eq(ps.from_pandas(pdf), psdf)
+
     def test_isnull(self):
         self.assert_eq(self.pser.isnull(), self.psser.isnull())
 
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 4244f5831aa50..3312ae2abf9b7 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -362,8 +362,9 @@ def infer_pd_series_spark_type(
     if dtype == np.dtype("object"):
         if len(pser) == 0 or pser.isnull().all():
             return types.NullType()
-        elif hasattr(pser.iloc[0], "__UDT__"):
-            return pser.iloc[0].__UDT__
+        first_idx = pser.first_valid_index()
+        if first_idx is not None and hasattr(pser.loc[first_idx], "__UDT__"):
+            return pser.loc[first_idx].__UDT__
         else:
             return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
     elif isinstance(dtype, CategoricalDtype):

From 9a7e8c09817b0182b21872f7cc72a3d6c4efba1e Mon Sep 17 00:00:00 2001
From: Peter Nguyen
Date: Mon, 21 Jul 2025 23:25:17 -0700
Subject: [PATCH 2/3] Use notnull() mask instead of first_valid_index

Co-authored-by: Takuya UESHIN
---
 python/pyspark/pandas/typedef/typehints.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 3312ae2abf9b7..48545d124b2d8 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -362,9 +362,9 @@ def infer_pd_series_spark_type(
     if dtype == np.dtype("object"):
         if len(pser) == 0 or pser.isnull().all():
             return types.NullType()
-        first_idx = pser.first_valid_index()
-        if first_idx is not None and hasattr(pser.loc[first_idx], "__UDT__"):
-            return pser.loc[first_idx].__UDT__
+        notnull = pser[pser.notnull()]
+        if hasattr(notnull.iloc[0], "__UDT__"):
+            return notnull.iloc[0].__UDT__
         else:
             return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
     elif isinstance(dtype, CategoricalDtype):

From 8a3e1dec9d38d868da6d80d2593595b6b43b5b96 Mon Sep 17 00:00:00 2001
From: Peter Nguyen
Date: Mon, 21 Jul 2025 23:26:29 -0700
Subject: [PATCH 3/3] Add 'test_with_all_null'

---
 python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
index f4e6e8f7d644a..f4f833ea9cf55 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py
@@ -143,6 +143,13 @@ def test_with_first_null(self):
         self.assert_eq(pdf, psdf._to_pandas())
         self.assert_eq(ps.from_pandas(pdf), psdf)
 
+    def test_with_all_null(self):
+        lst = [None, None, None, None]
+        pser = pd.Series(lst, dtype=object)
+        psser = ps.Series(lst, dtype=object)
+        self.assert_eq(pser, psser._to_pandas())
+        self.assert_eq(ps.from_pandas(pser), psser)
+
     def test_isnull(self):
         self.assert_eq(self.pser.isnull(), self.psser.isnull())
 