diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py index 8f9948dbb757c..fe7d1dafabd2f 100644 --- a/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py +++ b/python/pyspark/pandas/tests/diff_frames_ops/test_corrwith.py @@ -20,7 +20,6 @@ import numpy as np from pyspark import pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.pandas.config import set_option, reset_option from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils @@ -95,11 +94,7 @@ def test_corrwith(self): # and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes. df_bool = ps.DataFrame({"A": [True, True, False, False], "B": [True, False, False, True]}) ser_bool = ps.Series([True, True, False, True]) - if LooseVersion(pd.__version__) == LooseVersion("1.5.0"): - expected = ps.Series([0.5773502691896257, 0.5773502691896257], index=["B", "A"]) - self.assert_eq(df_bool.corrwith(ser_bool), expected, almost=True) - else: - self._test_corrwith(df_bool, ser_bool) + self._test_corrwith(df_bool, ser_bool) self._test_corrwith(self.psdf1, self.psdf1) self._test_corrwith(self.psdf1, self.psdf2) @@ -107,15 +102,7 @@ def test_corrwith(self): self._test_corrwith(self.psdf3, self.psdf4) self._test_corrwith(self.psdf1, self.psdf1.a) - # There was a regression in pandas 1.5.0, and fixed in pandas 1.5.1. - # Therefore, we only test the pandas 1.5.0 in different way. - # See https://github.com/pandas-dev/pandas/issues/49141 for the reported issue, - # and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes. 
- if LooseVersion(pd.__version__) == LooseVersion("1.5.0"): - expected = ps.Series([-0.08827348295047496, 0.4413674147523748], index=["b", "a"]) - self.assert_eq(self.psdf1.corrwith(self.psdf2.b), expected, almost=True) - else: - self._test_corrwith(self.psdf1, self.psdf2.b) + self._test_corrwith(self.psdf1, self.psdf2.b) self._test_corrwith(self.psdf2, self.psdf3.c) self._test_corrwith(self.psdf3, self.psdf4.f) diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 761e1100d8afa..acd80378333e8 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -21,7 +21,6 @@ from pandas.api.types import CategoricalDtype import pyspark.pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils @@ -202,18 +201,9 @@ def test_append(self): psidx3 = ps.from_pandas(pidx3) self.assert_eq(psidx1.append(psidx2), pidx1.append(pidx2)) - if LooseVersion(pd.__version__) >= LooseVersion("1.5.0"): - self.assert_eq( - psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category")) - ) - else: - expected_result = ps.CategoricalIndex( - ["x", "y", "z", "y", "x", "w", "z"], - categories=["z", "y", "x", "w"], - ordered=False, - dtype="category", - ) - self.assert_eq(psidx1.append(psidx3.astype("category")), expected_result) + self.assert_eq( + psidx1.append(psidx3.astype("category")), pidx1.append(pidx3.astype("category")) + ) # TODO: append non-categorical or categorical with a different category self.assertRaises(NotImplementedError, lambda: psidx1.append(psidx3)) diff --git a/python/pyspark/pandas/tests/indexes/test_conversion.py b/python/pyspark/pandas/tests/indexes/test_conversion.py index 9759a3d06a759..8b6726f27d988 100644 --- a/python/pyspark/pandas/tests/indexes/test_conversion.py +++ b/python/pyspark/pandas/tests/indexes/test_conversion.py @@ -20,7 +20,6 
@@ import pandas as pd from pyspark import pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED from pyspark.testing.sqlutils import SQLTestUtils @@ -102,15 +101,10 @@ def test_multi_index_from_index(self): self.assert_eq(pmidx, psmidx) # Specify the `names` - # Specify the `names` while Index creating is no longer supported from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - pmidx = pd.Index(tuples) - pmidx.names = ["Hello", "Koalas"] - psmidx = ps.Index(tuples) - psmidx.names = ["Hello", "Koalas"] - else: - pmidx = pd.Index(tuples, names=["Hello", "Koalas"]) - psmidx = ps.Index(tuples, names=["Hello", "Koalas"]) + pmidx = pd.Index(tuples) + pmidx.names = ["Hello", "Koalas"] + psmidx = ps.Index(tuples) + psmidx.names = ["Hello", "Koalas"] self.assertTrue(isinstance(psmidx, ps.MultiIndex)) self.assert_eq(pmidx, psmidx) @@ -243,36 +237,30 @@ def test_to_frame(self): # non-string names self.assert_eq(psidx.to_frame(name=[10, 20]), pidx.to_frame(name=[10, 20])) self.assert_eq(psidx.to_frame(name=("x", 10)), pidx.to_frame(name=("x", 10))) - if LooseVersion(pd.__version__) < LooseVersion("1.5.0"): - self.assert_eq( - psidx.to_frame(name=[("x", 10), ("y", 20)]), - pidx.to_frame(name=[("x", 10), ("y", 20)]), - ) - else: - # Since pandas 1.5.0, the result is changed as below: - # (x, 10) (y, 20) - # b - # 0 4 0 4 - # 1 5 1 5 - # 3 6 3 6 - # 5 3 5 3 - # 6 2 6 2 - # 8 1 8 1 - # 9 0 9 0 - # 0 9 0 - # 0 9 0 - # - # The columns should be `Index([('x', 20), ('y', 20)], dtype='object')`, - # but pandas API on Spark doesn't support such a way for creating Index. - # So, we currently cannot follow the behavior of pandas. 
- expected_result = ps.DataFrame( - {("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]}, - index=ps.MultiIndex.from_tuples( - [(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)], - names=[None, "b"], - ), - ) - self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result) + # Since pandas 1.5.0, the result is changed as below: + # (x, 10) (y, 20) + # b + # 0 4 0 4 + # 1 5 1 5 + # 3 6 3 6 + # 5 3 5 3 + # 6 2 6 2 + # 8 1 8 1 + # 9 0 9 0 + # 0 9 0 + # 0 9 0 + # + # The columns should be `Index([('x', 10), ('y', 20)], dtype='object')`, + # but pandas API on Spark doesn't support such a way for creating Index. + # So, we currently cannot follow the behavior of pandas. + expected_result = ps.DataFrame( + {("x", 10): [0, 1, 3, 5, 6, 8, 9, 9, 9], ("y", 20): [4, 5, 6, 3, 2, 1, 0, 0, 0]}, + index=ps.MultiIndex.from_tuples( + [(0, 4), (1, 5), (3, 6), (5, 3), (6, 2), (8, 1), (9, 0), (9, 0), (9, 0)], + names=[None, "b"], + ), + ) + self.assert_eq(psidx.to_frame(name=[("x", 10), ("y", 20)]), expected_result) def test_to_list(self): # Index diff --git a/python/pyspark/pandas/tests/indexes/test_name.py b/python/pyspark/pandas/tests/indexes/test_name.py index cacf3efcb38b1..d7e3bf1786e8a 100644 --- a/python/pyspark/pandas/tests/indexes/test_name.py +++ b/python/pyspark/pandas/tests/indexes/test_name.py @@ -20,7 +20,6 @@ import pandas as pd from pyspark import pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError from pyspark.testing.pandasutils import PandasOnSparkTestCase from pyspark.testing.sqlutils import SQLTestUtils @@ -88,12 +87,6 @@ def test_index_names(self): psidx.name = ["renamed"] with self.assertRaisesRegex(TypeError, expected_error_message): psidx.name = ["0", "1"] - # Specifying `names` when creating Index is no longer supported from pandas 2.0.0. 
- if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - pass - else: - with self.assertRaisesRegex(TypeError, expected_error_message): - ps.Index([(1, 2), (3, 4)], names=["a", ["b"]]) def test_multi_index_names(self): arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] diff --git a/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py index 2eca8cf4a4312..310fea2035926 100644 --- a/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py +++ b/python/pyspark/pandas/tests/indexes/test_symmetric_diff.py @@ -20,7 +20,6 @@ import pandas as pd import pyspark.pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils @@ -39,17 +38,10 @@ def test_index_symmetric_difference(self): (psidx1 + 1).symmetric_difference(psidx2).sort_values(), (pidx1 + 1).symmetric_difference(pidx2).sort_values(), ) - # No longer supported from pandas 2.0.0. - if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"): - self.assert_eq( - (psidx1 ^ psidx2).sort_values(), - ps.Index([1, 5], dtype="int64"), - ) - else: - self.assert_eq( - (psidx1 ^ psidx2).sort_values(), - (pidx1 ^ pidx2).sort_values(), - ) + self.assert_eq( + (psidx1 ^ psidx2).sort_values(), + ps.Index([1, 5], dtype="int64"), + ) self.assert_eq( psidx1.symmetric_difference(psidx2, result_name="result").sort_values(), pidx1.symmetric_difference(pidx2, result_name="result").sort_values(), diff --git a/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py b/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py index af77ea8aa64ff..065a0e8d6ecd5 100644 --- a/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py +++ b/python/pyspark/pandas/tests/io/test_dataframe_spark_io.py @@ -21,7 +21,6 @@ import pandas as pd from pyspark import pandas as ps -from pyspark.loose_version import LooseVersion from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils from 
pyspark.testing.utils import have_openpyxl, openpyxl_requirement_message @@ -96,17 +95,9 @@ def test_parquet_read_with_pandas_metadata(self): self.assert_eq(ps.read_parquet(path2, pandas_metadata=True), expected2) expected3 = expected2.set_index("index", append=True) - # There is a bug in `to_parquet` from pandas 1.5.0 when writing MultiIndex. - # See https://github.com/pandas-dev/pandas/issues/48848 for the reported issue. - if LooseVersion(pd.__version__) > LooseVersion("1.5.0"): - expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index( - "index", append=True - ) - else: - path3 = "{}/file3.parquet".format(tmp) - expected3.to_parquet(path3) - expected_psdf = ps.read_parquet(path3, pandas_metadata=True) - + expected_psdf = ps.read_parquet(path2, pandas_metadata=True).set_index( + "index", append=True + ) self.assert_eq(expected_psdf, expected3) def test_parquet_write(self):