Skip to content

Commit 7bbdda2

Browse files
zhengruifeng authored and dongjoon-hyun committed
[SPARK-52863][PYTHON] Clean up code paths for old pandas versions
### What changes were proposed in this pull request? Clean up code paths for old pandas versions, the minimum version is 2.2.0 now ### Why are the changes needed? code clean up ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #51550 from zhengruifeng/py_pd_220_cleanup. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent eadf1a4 commit 7bbdda2

File tree

6 files changed

+40
-99
lines changed

6 files changed

+40
-99
lines changed

python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import numpy as np
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.pandas.config import set_option, reset_option
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase
2625
from pyspark.testing.sqlutils import SQLTestUtils
@@ -95,27 +94,15 @@ def test_corrwith(self):
9594
# and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes.
9695
df_bool = ps.DataFrame({"A": [True, True, False, False], "B": [True, False, False, True]})
9796
ser_bool = ps.Series([True, True, False, True])
98-
if LooseVersion(pd.__version__) == LooseVersion("1.5.0"):
99-
expected = ps.Series([0.5773502691896257, 0.5773502691896257], index=["B", "A"])
100-
self.assert_eq(df_bool.corrwith(ser_bool), expected, almost=True)
101-
else:
102-
self._test_corrwith(df_bool, ser_bool)
97+
self._test_corrwith(df_bool, ser_bool)
10398

10499
self._test_corrwith(self.psdf1, self.psdf1)
105100
self._test_corrwith(self.psdf1, self.psdf2)
106101
self._test_corrwith(self.psdf2, self.psdf3)
107102
self._test_corrwith(self.psdf3, self.psdf4)
108103

109104
self._test_corrwith(self.psdf1, self.psdf1.a)
110-
# There was a regression in pandas 1.5.0, and fixed in pandas 1.5.1.
111-
# Therefore, we only test the pandas 1.5.0 in different way.
112-
# See https://github.com/pandas-dev/pandas/issues/49141 for the reported issue,
113-
# and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes.
114-
if LooseVersion(pd.__version__) == LooseVersion("1.5.0"):
115-
expected = ps.Series([-0.08827348295047496, 0.4413674147523748], index=["b", "a"])
116-
self.assert_eq(self.psdf1.corrwith(self.psdf2.b), expected, almost=True)
117-
else:
118-
self._test_corrwith(self.psdf1, self.psdf2.b)
105+
self._test_corrwith(self.psdf1, self.psdf2.b)
119106

120107
self._test_corrwith(self.psdf2, self.psdf3.c)
121108
self._test_corrwith(self.psdf3, self.psdf4.f)

python/pyspark/pandas/tests/indexes/test_category.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from pandas.api.types import CategoricalDtype
2222

2323
import pyspark.pandas as ps
24-
from pyspark.loose_version import LooseVersion
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2625

2726

@@ -202,18 +201,9 @@ def test_append(self):
202201
psidx3 = ps.from_pandas(pidx3)
203202

204203
self.assert_eq(psidx1.append(psidx2), pidx1.append(pidx2))
205-
if LooseVersion(pd.__version__) >= LooseVersion("1.5.0"):
206-
self.assert_eq(
207-
psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category"))
208-
)
209-
else:
210-
expected_result = ps.CategoricalIndex(
211-
["x", "y", "z", "y", "x", "w", "z"],
212-
categories=["z", "y", "x", "w"],
213-
ordered=False,
214-
dtype="category",
215-
)
216-
self.assert_eq(psidx1.append(psidx3.astype("category")), expected_result)
204+
self.assert_eq(
205+
psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category"))
206+
)
217207

218208
# TODO: append non-categorical or categorical with a different category
219209
self.assertRaises(NotImplementedError, lambda: psidx1.append(psidx3))

python/pyspark/pandas/tests/indexes/test_conversion.py

Lines changed: 28 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
2524
from pyspark.testing.sqlutils import SQLTestUtils
2625

@@ -102,15 +101,10 @@ def test_multi_index_from_index(self):
102101
self.assert_eq(pmidx, psmidx)
103102

104103
# Specify the `names`
105-
# Specify the `names` while Index creating is no longer supported from pandas 2.0.0.
106-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
107-
pmidx = pd.Index(tuples)
108-
pmidx.names = ["Hello", "Koalas"]
109-
psmidx = ps.Index(tuples)
110-
psmidx.names = ["Hello", "Koalas"]
111-
else:
112-
pmidx = pd.Index(tuples, names=["Hello", "Koalas"])
113-
psmidx = ps.Index(tuples, names=["Hello", "Koalas"])
104+
pmidx = pd.Index(tuples)
105+
pmidx.names = ["Hello", "Koalas"]
106+
psmidx = ps.Index(tuples)
107+
psmidx.names = ["Hello", "Koalas"]
114108

115109
self.assertTrue(isinstance(psmidx, ps.MultiIndex))
116110
self.assert_eq(pmidx, psmidx)
@@ -243,36 +237,30 @@ def test_to_frame(self):
243237
# non-string names
244238
self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20]))
245239
self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10)))
246-
if LooseVersion(pd.__version__) < LooseVersion("1.5.0"):
247-
self.assert_eq(
248-
psidx.to_frame(name=[("x", 10), ("y", 20)]),
249-
pidx.to_frame(name=[("x", 10), ("y", 20)]),
250-
)
251-
else:
252-
# Since pandas 1.5.0, the result is changed as below:
253-
# (x, 10) (y, 20)
254-
# b
255-
# 0 4 0 4
256-
# 1 5 1 5
257-
# 3 6 3 6
258-
# 5 3 5 3
259-
# 6 2 6 2
260-
# 8 1 8 1
261-
# 9 0 9 0
262-
# 0 9 0
263-
# 0 9 0
264-
#
265-
# The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`,
266-
# but pandas API on Spark doesn't support such a way for creating Index.
267-
# So, we currently cannot follow the behavior of pandas.
268-
expected_result = ps.DataFrame(
269-
{("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]},
270-
index=ps.MultiIndex.from_tuples(
271-
[(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)],
272-
names=[None, "b"],
273-
),
274-
)
275-
self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result)
240+
# Since pandas 1.5.0, the result is changed as below:
241+
# (x, 10) (y, 20)
242+
# b
243+
# 0 4 0 4
244+
# 1 5 1 5
245+
# 3 6 3 6
246+
# 5 3 5 3
247+
# 6 2 6 2
248+
# 8 1 8 1
249+
# 9 0 9 0
250+
# 0 9 0
251+
# 0 9 0
252+
#
253+
# The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`,
254+
# but pandas API on Spark doesn't support such a way for creating Index.
255+
# So, we currently cannot follow the behavior of pandas.
256+
expected_result = ps.DataFrame(
257+
{("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]},
258+
index=ps.MultiIndex.from_tuples(
259+
[(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)],
260+
names=[None, "b"],
261+
),
262+
)
263+
self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result)
276264

277265
def test_to_list(self):
278266
# Index

python/pyspark/pandas/tests/indexes/test_name.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
from pyspark import pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.pandas.exceptions import PandasNotImplementedError
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase
2625
from pyspark.testing.sqlutils import SQLTestUtils
@@ -88,12 +87,6 @@ def test_index_names(self):
8887
psidx.name = ["renamed"]
8988
with self.assertRaisesRegex(TypeError, expected_error_message):
9089
psidx.name = ["0", "1"]
91-
# Specifying `names` when creating Index is no longer supported from pandas 2.0.0.
92-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
93-
pass
94-
else:
95-
with self.assertRaisesRegex(TypeError, expected_error_message):
96-
ps.Index([(1, 2), (3, 4)], names=["a", ["b"]])
9790

9891
def test_multi_index_names(self):
9992
arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]

python/pyspark/pandas/tests/indexes/test_symmetric_diff.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
import pyspark.pandas as ps
23-
from pyspark.loose_version import LooseVersion
2423
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2524

2625

@@ -39,17 +38,10 @@ def test_index_symmetric_difference(self):
3938
(psidx1 + 1).symmetric_difference(psidx2).sort_values(),
4039
(pidx1 + 1).symmetric_difference(pidx2).sort_values(),
4140
)
42-
# No longer supported from pandas 2.0.0.
43-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
44-
self.assert_eq(
45-
(psidx1 ^ psidx2).sort_values(),
46-
ps.Index([1, 5], dtype="int64"),
47-
)
48-
else:
49-
self.assert_eq(
50-
(psidx1 ^ psidx2).sort_values(),
51-
(pidx1 ^ pidx2).sort_values(),
52-
)
41+
self.assert_eq(
42+
(psidx1 ^ psidx2).sort_values(),
43+
ps.Index([1, 5], dtype="int64"),
44+
)
5345
self.assert_eq(
5446
psidx1.symmetric_difference(psidx2, result_name="result").sort_values(),
5547
pidx1.symmetric_difference(pidx2, result_name="result").sort_values(),

python/pyspark/pandas/tests/io/test_dataframe_spark_io.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
import pandas as pd
2222

2323
from pyspark import pandas as ps
24-
from pyspark.loose_version import LooseVersion
2524
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
2625
from pyspark.testing.utils import have_openpyxl, openpyxl_requirement_message
2726

@@ -96,17 +95,9 @@ def test_parquet_read_with_pandas_metadata(self):
9695
self.assert_eq(ps.read_parquet(path2, pandas_metadata=True), expected2)
9796

9897
expected3 = expected2.set_index("index", append=True)
99-
# There is a bug in `to_parquet` from pandas 1.5.0 when writing MultiIndex.
100-
# See https://github.com/pandas-dev/pandas/issues/48848 for the reported issue.
101-
if LooseVersion(pd.__version__) > LooseVersion("1.5.0"):
102-
expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index(
103-
"index", append=True
104-
)
105-
else:
106-
path3 = "{}/file3.parquet".format(tmp)
107-
expected3.to_parquet(path3)
108-
expected_psdf = ps.read_parquet(path3, pandas_metadata=True)
109-
98+
expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index(
99+
"index", append=True
100+
)
110101
self.assert_eq(expected_psdf, expected3)
111102

112103
def test_parquet_write(self):

0 commit comments

Comments (0)