diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 4828b59d5ad26..0f2c8e02b18a8 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -3396,11 +3396,22 @@ def autocorr(self, lag: int = 1) -> float:
         else:
             lag_scol = F.lag(scol, lag).over(Window.orderBy(NATURAL_ORDER_COLUMN_NAME))
             lag_col_name = verify_temp_column_name(sdf, "__autocorr_lag_tmp_col__")
-        corr = (
-            sdf.withColumn(lag_col_name, lag_scol)
-            .select(F.corr(scol, F.col(lag_col_name)))
-            .head()[0]
-        )
+
+        sdf_lag = sdf.withColumn(lag_col_name, lag_scol)
+        if is_ansi_mode_enabled(sdf.sparkSession):
+            # Under ANSI mode, F.corr raises DIVIDE_BY_ZERO when either column
+            # has zero sample variance (the correlation denominator). Pre-check
+            # both variances and return NaN in that case, matching pandas.
+            # NOTE: checking the variances rather than the covariance avoids
+            # wrongly returning NaN when the covariance happens to be 0 while
+            # both variances are positive — pandas returns 0.0 there (e.g.
+            # Series([1, 2, 3, 2, 1]).autocorr(1)).
+            variances = sdf_lag.select(
+                F.var_samp(scol), F.var_samp(F.col(lag_col_name))
+            ).head()
+            if any(v is None or v == 0.0 for v in variances):
+                return np.nan
+        corr = sdf_lag.select(F.corr(scol, F.col(lag_col_name))).head()[0]
+
         return np.nan if corr is None else corr
 
     def corr(
diff --git a/python/pyspark/pandas/tests/series/test_stat.py b/python/pyspark/pandas/tests/series/test_stat.py
index cc03e64b48869..7c92edbeea3a4 100644
--- a/python/pyspark/pandas/tests/series/test_stat.py
+++ b/python/pyspark/pandas/tests/series/test_stat.py
@@ -607,6 +607,9 @@ def test_autocorr(self):
         with self.assertRaisesRegex(TypeError, r"lag should be an int; however, got"):
             psser.autocorr(1.0)
 
+        psser = ps.Series([1, 0, 0, 0])
+        self.assertTrue(bool(np.isnan(psser.autocorr())))
+
     def _test_autocorr(self, pdf):
         psdf = ps.from_pandas(pdf)
         for lag in range(-10, 10):