Skip to content

Commit 98645a2

Browse files
petern48 authored and HyukjinKwon committed
[SPARK-52791][PS] Fix error when inferring a UDT with a null first element
I modified the UDT condition to check the first non-null element instead of the first element (which might be null).

```
import pyspark.pandas as ps
from pyspark.ml.linalg import SparseVector

sparse_values = {0: 0.1, 1: 1.1}
ps_series = ps.Series([None, SparseVector(1, {0: 1.2}), SparseVector(1, {0: 3})])
```

Error:

```
pyarrow.lib.ArrowInvalid: Could not convert SparseVector(1, {0: 1.2}) with type SparseVector: did not recognize Python value type when inferring an Arrow data type
```

This should work as normal, but it fails because the first element is None. Previously it would error; now it works properly. This is a behavior change from all previous Spark versions and should probably be backported. Added a test. Closes #51475 from petern48/fix_infer_spark_type. Authored-by: Peter Nguyen <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]> (cherry picked from commit 5182eb4) Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent ebe6ca8 commit 98645a2

File tree

2 files changed

+23
-2
lines changed

2 files changed

+23
-2
lines changed

python/pyspark/pandas/tests/data_type_ops/test_udt_ops.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,26 @@ def test_from_to_pandas(self):
129129
self.assert_eq(pser, psser._to_pandas())
130130
self.assert_eq(ps.from_pandas(pser), psser)
131131

132+
def test_with_first_null(self):
    """Round-trip UDT data whose leading entries are null (SPARK-52791).

    Type inference must look at the first *non-null* element, so a
    Series/DataFrame column starting with None still converts cleanly.
    """
    # Series whose first non-null element is a SparseVector.
    values = [None, None, None, SparseVector(1, {0: 0.1})]
    pser = pd.Series(values)
    psser = ps.Series(values)
    self.assert_eq(pser, psser._to_pandas())
    self.assert_eq(ps.from_pandas(pser), psser)

    # DataFrame mixing a null-first column with a null-last column.
    reversed_values = [SparseVector(1, {0: 0.1}), None, None, None]
    columns = {"a": values, "b": reversed_values}
    pdf = pd.DataFrame(columns)
    psdf = ps.DataFrame(columns)
    self.assert_eq(pdf, psdf._to_pandas())
    self.assert_eq(ps.from_pandas(pdf), psdf)
144+
145+
def test_with_all_null(self):
    """Round-trip an object-dtype Series that contains only nulls."""
    data = [None] * 4
    pser = pd.Series(data, dtype=object)
    psser = ps.Series(data, dtype=object)
    self.assert_eq(pser, psser._to_pandas())
    self.assert_eq(ps.from_pandas(pser), psser)
151+
132152
def test_isnull(self):
133153
self.assert_eq(self.pser.isnull(), self.psser.isnull())
134154

python/pyspark/pandas/typedef/typehints.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,8 +354,9 @@ def infer_pd_series_spark_type(
354354
if dtype == np.dtype("object"):
355355
if len(pser) == 0 or pser.isnull().all():
356356
return types.NullType()
357-
elif hasattr(pser.iloc[0], "__UDT__"):
358-
return pser.iloc[0].__UDT__
357+
notnull = pser[pser.notnull()]
358+
if hasattr(notnull.iloc[0], "__UDT__"):
359+
return notnull.iloc[0].__UDT__
359360
else:
360361
return from_arrow_type(pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
361362
elif isinstance(dtype, CategoricalDtype):

0 commit comments

Comments
 (0)