Skip to content

Commit 7bbdda2

Browse files
zhengruifeng authored and dongjoon-hyun committed
[SPARK-52863][PYTHON] Clean up code paths for old pandas versions
### What changes were proposed in this pull request? Clean up code paths for old pandas versions, the minimum version is 2.2.0 now ### Why are the changes needed? code clean up ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #51550 from zhengruifeng/py_pd_220_cleanup. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent eadf1a4 commit 7bbdda2

File tree

6 files changed

+40
-99
lines changed

6 files changed

+40
-99
lines changed

python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import numpy as np
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.pandas.config import set_option, reset_option
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase
2625
from pyspark.testing.sqlutils import SQLTestUtils
@@ -95,27 +94,15 @@ def test_corrwith(self):
9594
# and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes.
9695
df_bool = ps.DataFrame({"A": [True, True, False, False], "B": [True, False, False, True]})
9796
ser_bool = ps.Series([True, True, False, True])
98-
if LooseVersion(pd.__version__) == LooseVersion("1.5.0"):
99-
expected = ps.Series([0.5773502691896257, 0.5773502691896257], index=["B", "A"])
100-
self.assert_eq(df_bool.corrwith(ser_bool), expected, almost=True)
101-
else:
102-
self._test_corrwith(df_bool, ser_bool)
97+
self._test_corrwith(df_bool, ser_bool)
10398

10499
self._test_corrwith(self.psdf1, self.psdf1)
105100
self._test_corrwith(self.psdf1, self.psdf2)
106101
self._test_corrwith(self.psdf2, self.psdf3)
107102
self._test_corrwith(self.psdf3, self.psdf4)
108103

109104
self._test_corrwith(self.psdf1, self.psdf1.a)
110-
# There was a regression in pandas 1.5.0, and fixed in pandas 1.5.1.
111-
# Therefore, we only test the pandas 1.5.0 in different way.
112-
# See https://github.com/pandas-dev/pandas/issues/49141 for the reported issue,
113-
# and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes.
114-
if LooseVersion(pd.__version__) == LooseVersion("1.5.0"):
115-
expected = ps.Series([-0.08827348295047496, 0.4413674147523748], index=["b", "a"])
116-
self.assert_eq(self.psdf1.corrwith(self.psdf2.b), expected, almost=True)
117-
else:
118-
self._test_corrwith(self.psdf1, self.psdf2.b)
105+
self._test_corrwith(self.psdf1, self.psdf2.b)
119106

120107
self._test_corrwith(self.psdf2, self.psdf3.c)
121108
self._test_corrwith(self.psdf3, self.psdf4.f)

python/pyspark/pandas/tests/indexes/test_category.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from pandas.api.types import CategoricalDtype
2222

2323
import pyspark.pandas as ps
24-
from pyspark.loose_version import LooseVersion
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2625

2726

@@ -202,18 +201,9 @@ def test_append(self):
202201
psidx3 = ps.from_pandas(pidx3)
203202

204203
self.assert_eq(psidx1.append(psidx2), pidx1.append(pidx2))
205-
if LooseVersion(pd.__version__) >= LooseVersion("1.5.0"):
206-
self.assert_eq(
207-
psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category"))
208-
)
209-
else:
210-
expected_result = ps.CategoricalIndex(
211-
["x", "y", "z", "y", "x", "w", "z"],
212-
categories=["z", "y", "x", "w"],
213-
ordered=False,
214-
dtype="category",
215-
)
216-
self.assert_eq(psidx1.append(psidx3.astype("category")), expected_result)
204+
self.assert_eq(
205+
psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category"))
206+
)
217207

218208
# TODO: append non-categorical or categorical with a different category
219209
self.assertRaises(NotImplementedError, lambda: psidx1.append(psidx3))

python/pyspark/pandas/tests/indexes/test_conversion.py

Lines changed: 28 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
2524
from pyspark.testing.sqlutils import SQLTestUtils
2625

@@ -102,15 +101,10 @@ def test_multi_index_from_index(self):
102101
self.assert_eq(pmidx, psmidx)
103102

104103
# Specify the `names`
105-
# Specify the `names` while Index creating is no longer supported from pandas 2.0.0.
106-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
107-
pmidx = pd.Index(tuples)
108-
pmidx.names = ["Hello", "Koalas"]
109-
psmidx = ps.Index(tuples)
110-
psmidx.names = ["Hello", "Koalas"]
111-
else:
112-
pmidx = pd.Index(tuples, names=["Hello", "Koalas"])
113-
psmidx = ps.Index(tuples, names=["Hello", "Koalas"])
104+
pmidx = pd.Index(tuples)
105+
pmidx.names = ["Hello", "Koalas"]
106+
psmidx = ps.Index(tuples)
107+
psmidx.names = ["Hello", "Koalas"]
114108

115109
self.assertTrue(isinstance(psmidx, ps.MultiIndex))
116110
self.assert_eq(pmidx, psmidx)
@@ -243,36 +237,30 @@ def test_to_frame(self):
243237
# non-string names
244238
self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20]))
245239
self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10)))
246-
if LooseVersion(pd.__version__) < LooseVersion("1.5.0"):
247-
self.assert_eq(
248-
psidx.to_frame(name=[("x", 10), ("y", 20)]),
249-
pidx.to_frame(name=[("x", 10), ("y", 20)]),
250-
)
251-
else:
252-
# Since pandas 1.5.0, the result is changed as below:
253-
# (x, 10) (y, 20)
254-
# b
255-
# 0 4 0 4
256-
# 1 5 1 5
257-
# 3 6 3 6
258-
# 5 3 5 3
259-
# 6 2 6 2
260-
# 8 1 8 1
261-
# 9 0 9 0
262-
# 0 9 0
263-
# 0 9 0
264-
#
265-
# The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`,
266-
# but pandas API on Spark doesn't support such a way for creating Index.
267-
# So, we currently cannot follow the behavior of pandas.
268-
expected_result = ps.DataFrame(
269-
{("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]},
270-
index=ps.MultiIndex.from_tuples(
271-
[(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)],
272-
names=[None, "b"],
273-
),
274-
)
275-
self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result)
240+
# Since pandas 1.5.0, the result is changed as below:
241+
# (x, 10) (y, 20)
242+
# b
243+
# 0 4 0 4
244+
# 1 5 1 5
245+
# 3 6 3 6
246+
# 5 3 5 3
247+
# 6 2 6 2
248+
# 8 1 8 1
249+
# 9 0 9 0
250+
# 0 9 0
251+
# 0 9 0
252+
#
253+
# The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`,
254+
# but pandas API on Spark doesn't support such a way for creating Index.
255+
# So, we currently cannot follow the behavior of pandas.
256+
expected_result = ps.DataFrame(
257+
{("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]},
258+
index=ps.MultiIndex.from_tuples(
259+
[(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)],
260+
names=[None, "b"],
261+
),
262+
)
263+
self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result)
276264

277265
def test_to_list(self):
278266
# Index

python/pyspark/pandas/tests/indexes/test_name.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.pandas.exceptions import PandasNotImplementedError
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase
2625
from pyspark.testing.sqlutils import SQLTestUtils
@@ -88,12 +87,6 @@ def test_index_names(self):
8887
psidx.name = ["renamed"]
8988
with self.assertRaisesRegex(TypeError, expected_error_message):
9089
psidx.name = ["0", "1"]
91-
# Specifying `names` when creating Index is no longer supported from pandas 2.0.0.
92-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
93-
pass
94-
else:
95-
with self.assertRaisesRegex(TypeError, expected_error_message):
96-
ps.Index([(1, 2), (3, 4)], names=["a", ["b"]])
9790

9891
def test_multi_index_names(self):
9992
arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]

python/pyspark/pandas/tests/indexes/test_symmetric_diff.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
import pyspark.pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2524

2625

@@ -39,17 +38,10 @@ def test_index_symmetric_difference(self):
3938
(psidx1 + 1).symmetric_difference(psidx2).sort_values(),
4039
(pidx1 + 1).symmetric_difference(pidx2).sort_values(),
4140
)
42-
# No longer supported from pandas 2.0.0.
43-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
44-
self.assert_eq(
45-
(psidx1 ^ psidx2).sort_values(),
46-
ps.Index([1, 5], dtype="int64"),
47-
)
48-
else:
49-
self.assert_eq(
50-
(psidx1 ^ psidx2).sort_values(),
51-
(pidx1 ^ pidx2).sort_values(),
52-
)
41+
self.assert_eq(
42+
(psidx1 ^ psidx2).sort_values(),
43+
ps.Index([1, 5], dtype="int64"),
44+
)
5345
self.assert_eq(
5446
psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
5547
pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),

python/pyspark/pandas/tests/io/test_dataframe_spark_io.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import pandas as pd
2222

2323
from pyspark import pandas as ps
24-
from pyspark.loose_version import LooseVersion
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2625
from pyspark.testing.utils import have_openpyxl, openpyxl_requirement_message
2726

@@ -96,17 +95,9 @@ def test_parquet_read_with_pandas_metadata(self):
9695
self.assert_eq(ps.read_parquet(path2, pandas_metadata=True), expected2)
9796

9897
expected3 = expected2.set_index("index", append=True)
99-
# There is a bug in `to_parquet` from pandas 1.5.0 when writing MultiIndex.
100-
# See https://github.com/pandas-dev/pandas/issues/48848 for the reported issue.
101-
if LooseVersion(pd.__version__) > LooseVersion("1.5.0"):
102-
expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index(
103-
"index", append=True
104-
)
105-
else:
106-
path3 = "{}/file3.parquet".format(tmp)
107-
expected3.to_parquet(path3)
108-
expected_psdf = ps.read_parquet(path3, pandas_metadata=True)
109-
98+
expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index(
99+
"index", append=True
100+
)
110101
self.assert_eq(expected_psdf, expected3)
111102

112103
def test_parquet_write(self):

0 commit comments

Comments (0)