
Commit 829d095

[SPARK-52821] Add int->DecimalType PySpark UDF return type coercion
Parent: c7c1021 · Commit: 829d095

11 files changed, +362 −18 lines

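The change addresses a conversion gap: when a pandas UDF declares a DecimalType result but returns plain integers, the Arrow conversion of the result series can fail. Below is a minimal repro sketch of that failure, separate from the diff itself; it assumes only pandas and pyarrow, and the exact exception type and message vary by PyArrow version.

from decimal import Decimal

import pandas as pd
import pyarrow as pa

ints = pd.Series([12345, 98765])   # e.g. a UDF returned plain ints
target = pa.decimal128(10, 2)      # Arrow shape of Spark's DecimalType(10, 2)

try:
    # int64 -> decimal128 with precision < 20: rejected with a precision error
    pa.Array.from_pandas(ints, type=target)
except (pa.ArrowInvalid, pa.ArrowTypeError, pa.ArrowNotImplementedError) as e:
    print("conversion failed:", e)

# Coercing to Decimal first, as the new code path does, succeeds:
print(pa.Array.from_pandas(ints.apply(lambda x: Decimal(int(x))), type=target))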

python/pyspark/sql/connect/session.py

Lines changed: 1 addition & 1 deletion
@@ -619,7 +619,7 @@ def createDataFrame(
 
         safecheck = configs["spark.sql.execution.pandas.convertToArrowArraySafely"]
 
-        ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true")
+        ser = ArrowStreamPandasSerializer(cast(str, timezone), safecheck == "true", False)
 
         _table = pa.Table.from_batches(
             [

python/pyspark/sql/pandas/conversion.py

Lines changed: 1 addition & 1 deletion
@@ -739,7 +739,7 @@ def _create_from_pandas_with_arrow(
         jsparkSession = self._jsparkSession
 
         safecheck = self._jconf.arrowSafeTypeConversion()
-        ser = ArrowStreamPandasSerializer(timezone, safecheck)
+        ser = ArrowStreamPandasSerializer(timezone, safecheck, False)
 
         @no_type_check
         def reader_func(temp_filename):
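Both driver-side createDataFrame call sites above pass the new third argument as a literal False: they convert user-supplied input rows rather than UDF results, so the coercion and its per-row cost presumably stay off on those paths. Only the UDF serializers below thread the flag through from the new spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled conf exercised in the tests.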

python/pyspark/sql/pandas/serializers.py

Lines changed: 83 additions & 8 deletions
@@ -19,6 +19,7 @@
 Serializers for PyArrow and pandas conversions. See `pyspark.serializers` for more details.
 """
 
+from decimal import Decimal
 from itertools import groupby
 from typing import TYPE_CHECKING, Optional

@@ -251,12 +252,50 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
         If True, conversion from Arrow to Pandas checks for overflow/truncation
     assign_cols_by_name : bool
         If True, then Pandas DataFrames will get columns by name
+    int_to_decimal_coercion_enabled : bool
+        If True, applies additional coercions in Python before converting to Arrow
+        This has performance penalties.
     """
 
-    def __init__(self, timezone, safecheck):
+    def __init__(self, timezone, safecheck, int_to_decimal_coercion_enabled):
         super(ArrowStreamPandasSerializer, self).__init__()
         self._timezone = timezone
         self._safecheck = safecheck
+        self._int_to_decimal_coercion_enabled = int_to_decimal_coercion_enabled
+
+    @staticmethod
+    def _apply_python_coercions(series, arrow_type):
+        """
+        Apply additional coercions to the series in Python before converting to Arrow:
+        - Convert integer series to decimal type.
+          When we have a pandas series of integers that needs to be converted to
+          pyarrow.decimal128 (with precision < 20), PyArrow fails with precision errors.
+          Explicitly cast to Decimal first.
+
+        Parameters
+        ----------
+        series : pandas.Series
+            The series to potentially convert
+        arrow_type : pyarrow.DataType
+            The target arrow type
+
+        Returns
+        -------
+        pandas.Series
+            The potentially converted pandas series
+        """
+        import pyarrow.types as types
+        import pandas as pd
+
+        # Convert integer series to Decimal objects
+        if (
+            types.is_decimal(arrow_type)
+            and series.dtype.kind in ["i", "u"]  # integer types (signed/unsigned)
+            and not series.empty
+        ):
+            series = series.apply(lambda x: Decimal(x) if pd.notna(x) else None)
+
+        return series
 
     def arrow_to_pandas(
         self, arrow_column, idx, struct_in_pandas="dict", ndarray_as_list=False, spark_type=None
@@ -326,6 +365,9 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
             )
             series = conv(series)
 
+        if self._int_to_decimal_coercion_enabled:
+            series = self._apply_python_coercions(series, arrow_type)
+
         if hasattr(series.array, "__arrow_array__"):
             mask = None
         else:
@@ -444,8 +486,11 @@ def __init__(
         ndarray_as_list=False,
         arrow_cast=False,
         input_types=None,
+        int_to_decimal_coercion_enabled=False,
     ):
-        super(ArrowStreamPandasUDFSerializer, self).__init__(timezone, safecheck)
+        super(ArrowStreamPandasUDFSerializer, self).__init__(
+            timezone, safecheck, int_to_decimal_coercion_enabled
+        )
         self._assign_cols_by_name = assign_cols_by_name
         self._df_for_struct = df_for_struct
         self._struct_in_pandas = struct_in_pandas
@@ -799,7 +844,7 @@ class ArrowStreamPandasUDTFSerializer(ArrowStreamPandasUDFSerializer):
     Serializer used by Python worker to evaluate Arrow-optimized Python UDTFs.
     """
 
-    def __init__(self, timezone, safecheck):
+    def __init__(self, timezone, safecheck, int_to_decimal_coercion_enabled):
         super(ArrowStreamPandasUDTFSerializer, self).__init__(
             timezone=timezone,
             safecheck=safecheck,
@@ -819,6 +864,8 @@ def __init__(self, timezone, safecheck):
             ndarray_as_list=True,
             # Enables explicit casting for mismatched return types of Arrow Python UDTFs.
             arrow_cast=True,
+            # Enable additional coercions for UDTF serialization
+            int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
         )
         self._converter_map = dict()

@@ -905,6 +952,9 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
             conv = self._get_or_create_converter_from_pandas(dt)
             series = conv(series)
 
+        if self._int_to_decimal_coercion_enabled:
+            series = self._apply_python_coercions(series, arrow_type)
+
         if hasattr(series.array, "__arrow_array__"):
             mask = None
         else:
@@ -1036,9 +1086,13 @@ def __init__(
         state_object_schema,
         arrow_max_records_per_batch,
         prefers_large_var_types,
+        int_to_decimal_coercion_enabled,
     ):
         super(ApplyInPandasWithStateSerializer, self).__init__(
-            timezone, safecheck, assign_cols_by_name
+            timezone,
+            safecheck,
+            assign_cols_by_name,
+            int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
         )
         self.pickleSer = CPickleSerializer()
         self.utf8_deserializer = UTF8Deserializer()
@@ -1406,9 +1460,19 @@ class TransformWithStateInPandasSerializer(ArrowStreamPandasUDFSerializer):
         Limit of the number of records that can be written to a single ArrowRecordBatch in memory.
     """
 
-    def __init__(self, timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch):
+    def __init__(
+        self,
+        timezone,
+        safecheck,
+        assign_cols_by_name,
+        arrow_max_records_per_batch,
+        int_to_decimal_coercion_enabled,
+    ):
         super(TransformWithStateInPandasSerializer, self).__init__(
-            timezone, safecheck, assign_cols_by_name
+            timezone,
+            safecheck,
+            assign_cols_by_name,
+            int_to_decimal_coercion_enabled=int_to_decimal_coercion_enabled,
         )
         self.arrow_max_records_per_batch = arrow_max_records_per_batch
         self.key_offsets = None
@@ -1482,9 +1546,20 @@ class TransformWithStateInPandasInitStateSerializer(TransformWithStateInPandasSerializer):
         Same as input parameters in TransformWithStateInPandasSerializer.
     """
 
-    def __init__(self, timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch):
+    def __init__(
+        self,
+        timezone,
+        safecheck,
+        assign_cols_by_name,
+        arrow_max_records_per_batch,
+        int_to_decimal_coercion_enabled,
+    ):
         super(TransformWithStateInPandasInitStateSerializer, self).__init__(
-            timezone, safecheck, assign_cols_by_name, arrow_max_records_per_batch
+            timezone,
+            safecheck,
+            assign_cols_by_name,
+            arrow_max_records_per_batch,
+            int_to_decimal_coercion_enabled,
        )
        self.init_key_offsets = None
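The guard in _apply_python_coercions can be exercised outside Spark. Here is a standalone sketch of the same logic, assuming only pandas and pyarrow are installed; Decimal(int(x)) is used here, rather than the method's Decimal(x), to normalize numpy scalars in this stripped-down setting.

from decimal import Decimal

import pandas as pd
import pyarrow as pa
import pyarrow.types as types


def apply_python_coercions(series, arrow_type):
    # Same guard as the new static method: only non-empty signed/unsigned
    # integer series targeting a decimal Arrow type are rewritten.
    if (
        types.is_decimal(arrow_type)
        and series.dtype.kind in ["i", "u"]
        and not series.empty
    ):
        series = series.apply(lambda x: Decimal(int(x)) if pd.notna(x) else None)
    return series


coerced = apply_python_coercions(pd.Series([12345, 98765]), pa.decimal128(10, 2))
print(pa.Array.from_pandas(coerced, type=pa.decimal128(10, 2)))  # 12345.00, 98765.00
print(apply_python_coercions(pd.Series([1.5]), pa.decimal128(10, 2)).dtype)  # float64: untouched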

python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py

Lines changed: 48 additions & 0 deletions
@@ -281,6 +281,54 @@ def check_apply_in_pandas_returning_incompatible_type(self):
             error_message_regex=expected,
         )
 
+    def test_cogroup_apply_int_to_decimal_coercion(self):
+        left = self.data1.limit(3)
+        right = self.data2.limit(3)
+
+        def int_to_decimal_merge(lft, rgt):
+            return pd.DataFrame(
+                [
+                    {
+                        "id": 1,
+                        "decimal_result": 98765,
+                        "left_count": len(lft),
+                        "right_count": len(rgt),
+                    }
+                ]
+            )
+
+        with self.sql_conf(
+            {"spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled": True}
+        ):
+            result = (
+                left.groupby("id")
+                .cogroup(right.groupby("id"))
+                .applyInPandas(
+                    int_to_decimal_merge,
+                    "id long, decimal_result decimal(10,2), left_count long, right_count long",
+                )
+                .collect()
+            )
+            self.assertTrue(len(result) > 0)
+            for row in result:
+                self.assertEqual(row.decimal_result, 98765.00)
+
+        with self.sql_conf(
+            {"spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled": False}
+        ):
+            with self.assertRaisesRegex(
+                PythonException, "Exception thrown when converting pandas.Series"
+            ):
+                (
+                    left.groupby("id")
+                    .cogroup(right.groupby("id"))
+                    .applyInPandas(
+                        int_to_decimal_merge,
+                        "id long, decimal_result decimal(10,2), left_count long, right_count long",
+                    )
+                    .collect()
+                )
+
     def test_mixed_scalar_udfs_followed_by_cogrouby_apply(self):
         df = self.spark.range(0, 10).toDF("v1")
         df = df.withColumn("v2", udf(lambda x: x + 1, "int")(df["v1"])).withColumn(
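Note that both halves of this test hinge on the same serializer hook: with the conf off, _create_array hands the raw integer series to Arrow unchanged, and the failure surfaces wrapped in the serializer's "Exception thrown when converting pandas.Series" message that the assertRaisesRegex checks here, and in the grouped-map test below, match on.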

python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py

Lines changed: 31 additions & 0 deletions
@@ -387,6 +387,37 @@ def check_apply_in_pandas_returning_incompatible_type(self):
             output_schema="id long, mean string",
         )
 
+    def test_apply_in_pandas_int_to_decimal_coercion(self):
+        def int_to_decimal_func(key, pdf):
+            return pd.DataFrame([{"id": key[0], "decimal_result": 12345}])
+
+        with self.sql_conf(
+            {"spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled": True}
+        ):
+            result = (
+                self.data.groupby("id")
+                .applyInPandas(int_to_decimal_func, schema="id long, decimal_result decimal(10,2)")
+                .collect()
+            )
+
+            self.assertTrue(len(result) > 0)
+            for row in result:
+                self.assertEqual(row.decimal_result, 12345.00)
+
+        with self.sql_conf(
+            {"spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled": False}
+        ):
+            with self.assertRaisesRegex(
+                PythonException, "Exception thrown when converting pandas.Series"
+            ):
+                (
+                    self.data.groupby("id")
+                    .applyInPandas(
+                        int_to_decimal_func, schema="id long, decimal_result decimal(10,2)"
+                    )
+                    .collect()
+                )
+
     def test_datatype_string(self):
         df = self.data

python/pyspark/sql/tests/pandas/test_pandas_grouped_map_with_state.py

Lines changed: 28 additions & 4 deletions
@@ -23,6 +23,7 @@
 
 import unittest
 from typing import cast
+from decimal import Decimal
 
 from pyspark.sql.streaming.state import GroupStateTimeout, GroupState
 from pyspark.sql.types import (
@@ -31,6 +32,7 @@
     StructType,
     StructField,
     Row,
+    DecimalType,
 )
 from pyspark.testing.sqlutils import (
     ReusedSQLTestCase,
@@ -59,7 +61,12 @@ def conf(cls):
         cfg.set("spark.sql.shuffle.partitions", "5")
         return cfg
 
-    def _test_apply_in_pandas_with_state_basic(self, func, check_results):
+    def _test_apply_in_pandas_with_state_basic(self, func, check_results, output_type=None):
+        if output_type is None:
+            output_type = StructType(
+                [StructField("key", StringType()), StructField("countAsString", StringType())]
+            )
+
         input_path = tempfile.mkdtemp()
 
         def prepare_test_resource():
@@ -75,9 +82,6 @@ def prepare_test_resource():
             q.stop()
         self.assertTrue(df.isStreaming)
 
-        output_type = StructType(
-            [StructField("key", StringType()), StructField("countAsString", StringType())]
-        )
         state_type = StructType([StructField("c", LongType())])
 
         q = (
@@ -314,6 +318,26 @@ def assert_test():
         finally:
             q.stop()
 
+    def test_apply_in_pandas_with_state_int_to_decimal_coercion(self):
+        def func(key, pdf_iter, state):
+            assert isinstance(state, GroupState)
+            yield pd.DataFrame({"key": [key[0]], "decimal_sum": [1]})
+
+        def check_results(batch_df, _):
+            assert set(batch_df.sort("key").collect()) == {
+                Row(key="hello", decimal_sum=Decimal("1.00")),
+                Row(key="this", decimal_sum=Decimal("1.00")),
+            }, "Decimal coercion failed: " + str(batch_df.sort("key").collect())
+
+        output_type = StructType(
+            [StructField("key", StringType()), StructField("decimal_sum", DecimalType(10, 2))]
+        )
+
+        with self.sql_conf(
+            {"spark.sql.execution.pythonUDF.pandas.intToDecimalCoercionEnabled": True}
+        ):
+            self._test_apply_in_pandas_with_state_basic(func, check_results, output_type)
+
 
 class GroupedApplyInPandasWithStateTests(
     GroupedApplyInPandasWithStateTestsMixin, ReusedSQLTestCase