
Commit 7251e95

Yicong-Huang authored and zhengruifeng committed
[SPARK-53615][PYTHON] Introduce iterator API for Arrow grouped aggregation UDF
### What changes were proposed in this pull request? This PR introduces an iterator API for Arrow grouped aggregation UDFs in PySpark. It adds support for two new UDF patterns: - `Iterator[pa.Array] -> Any` for single column aggregations - `Iterator[Tuple[pa.Array, ...]] -> Any` for multiple column aggregations The implementation adds a new Python eval type `SQL_GROUPED_AGG_ARROW_ITER_UDF` with corresponding support in type inference, worker serialization, and Scala execution planning. ### Why are the changes needed? The current Arrow grouped aggregation API requires loading all data for a group into memory at once, which can be problematic for groups with large amounts of data. The iterator API allows processing data in batches, providing: 1. **Memory Efficiency**: Processes data incrementally rather than loading entire group into memory 2. **Consistency**: Aligns with existing iterator APIs (e.g., `SQL_SCALAR_ARROW_ITER_UDF`) 3. **Flexibility**: Allows initialization of expensive state once per group while processing batches iteratively ### Does this PR introduce _any_ user-facing change? Yes. This PR adds a new API pattern for Arrow grouped aggregation UDFs: **Single column aggregation:** ```python import pyarrow as pa from typing import Iterator from pyspark.sql.functions import arrow_udf arrow_udf("double") def arrow_mean(it: Iterator[pa.Array]) -> float: sum_val = 0.0 cnt = 0 for v in it: sum_val += pa.compute.sum(v).as_py() cnt += len(v) return sum_val / cnt if cnt > 0 else 0.0 df.groupby("id").agg(arrow_mean(df['v'])).show() ``` **Multiple column aggregation:** ```python import pyarrow as pa import numpy as np from typing import Iterator, Tuple from pyspark.sql.functions import arrow_udf arrow_udf("double") def arrow_weighted_mean(it: Iterator[Tuple[pa.Array, pa.Array]]) -> float: weighted_sum = 0.0 weight = 0.0 for v, w in it: weighted_sum += np.dot(v.to_numpy(), w.to_numpy()) weight += pa.compute.sum(w).as_py() return weighted_sum / weight if weight > 0 else 0.0 df.groupby("id").agg(arrow_weighted_mean(df["v"], df["w"])).show() ``` ### How was this patch tested? Added comprehensive unit tests in `python/pyspark/sql/tests/arrow/test_arrow_udf_grouped_agg.py`: 1. `test_iterator_grouped_agg_single_column()` - Tests single column iterator aggregation with `Iterator[pa.Array]` 2. `test_iterator_grouped_agg_multiple_columns()` - Tests multiple column iterator aggregation with `Iterator[Tuple[pa.Array, pa.Array]]` 3. `test_iterator_grouped_agg_eval_type()` - Verifies correct eval type inference from type hints ### Was this patch authored or co-authored using generative AI tooling? Co-Generated-by: Cursor with Claude Sonnet 4.5 Closes #53035 from Yicong-Huang/SPARK-53615/feat/arrow-grouped-agg-iterator-api. Authored-by: Yicong-Huang <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent d4e34f5 commit 7251e95

File tree

12 files changed: +456 −3 lines changed

core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala

Lines changed: 2 additions & 0 deletions
@@ -75,6 +75,7 @@ private[spark] object PythonEvalType {
   val SQL_SCALAR_ARROW_ITER_UDF = 251
   val SQL_GROUPED_AGG_ARROW_UDF = 252
   val SQL_WINDOW_AGG_ARROW_UDF = 253
+  val SQL_GROUPED_AGG_ARROW_ITER_UDF = 254
 
   val SQL_TABLE_UDF = 300
   val SQL_ARROW_TABLE_UDF = 301
@@ -112,6 +113,7 @@ private[spark] object PythonEvalType {
     case SQL_SCALAR_ARROW_ITER_UDF => "SQL_SCALAR_ARROW_ITER_UDF"
     case SQL_GROUPED_AGG_ARROW_UDF => "SQL_GROUPED_AGG_ARROW_UDF"
     case SQL_WINDOW_AGG_ARROW_UDF => "SQL_WINDOW_AGG_ARROW_UDF"
+    case SQL_GROUPED_AGG_ARROW_ITER_UDF => "SQL_GROUPED_AGG_ARROW_ITER_UDF"
   }
 }

python/pyspark/sql/pandas/_typing/__init__.pyi

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ ArrowScalarUDFType = Literal[250]
 ArrowScalarIterUDFType = Literal[251]
 ArrowGroupedAggUDFType = Literal[252]
 ArrowWindowAggUDFType = Literal[253]
+ArrowGroupedAggIterUDFType = Literal[254]
 
 class ArrowVariadicScalarToScalarFunction(Protocol):
     def __call__(self, *_: pyarrow.Array) -> pyarrow.Array: ...

python/pyspark/sql/pandas/functions.py

Lines changed: 67 additions & 0 deletions
@@ -50,6 +50,8 @@ class ArrowUDFType:
 
     GROUPED_AGG = PythonEvalType.SQL_GROUPED_AGG_ARROW_UDF
 
+    GROUPED_AGG_ITER = PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF
+
 
 def arrow_udf(f=None, returnType=None, functionType=None):
     """
@@ -301,6 +303,69 @@ def calculate(iterator: Iterator[pa.Array]) -> Iterator[pa.Array]:
         Therefore, mutating the input arrays is not allowed and will cause incorrect results.
         For the same reason, users should also not rely on the index of the input arrays.
 
+    * Iterator of Arrays to Scalar
+        `Iterator[pyarrow.Array]` -> `Any`
+
+        The function takes an iterator of `pyarrow.Array` and returns a scalar value. This is
+        useful for grouped aggregations where the UDF can process all batches for a group
+        iteratively, which is more memory-efficient than loading all data at once. The returned
+        scalar can be a python primitive type, a numpy data type, or a `pyarrow.Scalar` instance.
+
+        .. note:: Only a single UDF is supported per aggregation.
+
+        >>> from typing import Iterator
+        >>> @arrow_udf("double")
+        ... def arrow_mean(it: Iterator[pa.Array]) -> float:
+        ...     sum_val = 0.0
+        ...     cnt = 0
+        ...     for v in it:
+        ...         assert isinstance(v, pa.Array)
+        ...         sum_val += pa.compute.sum(v).as_py()
+        ...         cnt += len(v)
+        ...     return sum_val / cnt
+        ...
+        >>> df = spark.createDataFrame(
+        ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ("id", "v"))
+        >>> df.groupby("id").agg(arrow_mean(df['v'])).show()
+        +---+-------------+
+        | id|arrow_mean(v)|
+        +---+-------------+
+        |  1|          1.5|
+        |  2|          6.0|
+        +---+-------------+
+
+    * Iterator of Multiple Arrays to Scalar
+        `Iterator[Tuple[pyarrow.Array, ...]]` -> `Any`
+
+        The function takes an iterator of a tuple of multiple `pyarrow.Array` and returns a
+        scalar value. This is useful for grouped aggregations with multiple input columns.
+
+        .. note:: Only a single UDF is supported per aggregation.
+
+        >>> from typing import Iterator, Tuple
+        >>> import numpy as np
+        >>> @arrow_udf("double")
+        ... def arrow_weighted_mean(it: Iterator[Tuple[pa.Array, pa.Array]]) -> float:
+        ...     weighted_sum = 0.0
+        ...     weight = 0.0
+        ...     for v, w in it:
+        ...         assert isinstance(v, pa.Array)
+        ...         assert isinstance(w, pa.Array)
+        ...         weighted_sum += np.dot(v, w)
+        ...         weight += pa.compute.sum(w).as_py()
+        ...     return weighted_sum / weight
+        ...
+        >>> df = spark.createDataFrame(
+        ...     [(1, 1.0, 1.0), (1, 2.0, 2.0), (2, 3.0, 1.0), (2, 5.0, 2.0), (2, 10.0, 3.0)],
+        ...     ("id", "v", "w"))
+        >>> df.groupby("id").agg(arrow_weighted_mean(df["v"], df["w"])).show()
+        +---+-------------------------+
+        | id|arrow_weighted_mean(v, w)|
+        +---+-------------------------+
+        |  1|       1.6666666666666...|
+        |  2|        7.166666666666...|
+        +---+-------------------------+
+
     Notes
     -----
     The user-defined functions do not support conditional expressions or short circuiting
@@ -720,6 +785,7 @@ def vectorized_udf(
         PythonEvalType.SQL_SCALAR_ARROW_UDF,
         PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
         PythonEvalType.SQL_GROUPED_AGG_ARROW_UDF,
+        PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF,
         None,
     ]:  # None means it should infer the type from type hints.
         raise PySparkTypeError(
@@ -768,6 +834,7 @@ def _validate_vectorized_udf(f, evalType, kind: str = "pandas") -> int:
         PythonEvalType.SQL_SCALAR_ARROW_UDF,
         PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
         PythonEvalType.SQL_GROUPED_AGG_ARROW_UDF,
+        PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF,
     ]:
         warnings.warn(
             "It is preferred to specify type hints for "

python/pyspark/sql/pandas/functions.pyi

Lines changed: 2 additions & 0 deletions
@@ -41,6 +41,7 @@ from pyspark.sql.pandas._typing import (
     ArrowScalarIterFunction,
     ArrowScalarIterUDFType,
     ArrowGroupedAggUDFType,
+    ArrowGroupedAggIterUDFType,
 )
 
 from pyspark import since as since  # noqa: F401
@@ -57,6 +58,7 @@ class ArrowUDFType:
     SCALAR: ArrowScalarUDFType
     SCALAR_ITER: ArrowScalarIterUDFType
     GROUPED_AGG: ArrowGroupedAggUDFType
+    GROUPED_AGG_ITER: ArrowGroupedAggIterUDFType
 
 @overload
 def arrow_udf(

python/pyspark/sql/pandas/serializers.py

Lines changed: 52 additions & 0 deletions
@@ -1185,6 +1185,58 @@ def __repr__(self):
         return "ArrowStreamAggArrowUDFSerializer"
 
 
+# Serializer for SQL_GROUPED_AGG_ARROW_ITER_UDF
+class ArrowStreamAggArrowIterUDFSerializer(ArrowStreamArrowUDFSerializer):
+    def __init__(
+        self,
+        timezone,
+        safecheck,
+        assign_cols_by_name,
+        arrow_cast,
+    ):
+        super().__init__(
+            timezone=timezone,
+            safecheck=safecheck,
+            assign_cols_by_name=False,
+            arrow_cast=True,
+        )
+        self._timezone = timezone
+        self._safecheck = safecheck
+        self._assign_cols_by_name = assign_cols_by_name
+        self._arrow_cast = arrow_cast
+
+    def load_stream(self, stream):
+        """
+        Yield an iterator that produces one list of column arrays per batch.
+        Each group yields Iterator[List[pa.Array]], allowing UDF to process batches one by one
+        without consuming all batches upfront.
+        """
+        dataframes_in_group = None
+
+        while dataframes_in_group is None or dataframes_in_group > 0:
+            dataframes_in_group = read_int(stream)
+
+            if dataframes_in_group == 1:
+                # Lazily read and convert Arrow batches one at a time from the stream.
+                # This avoids loading all batches into memory for the group.
+                batch_iter = (
+                    batch.columns for batch in ArrowStreamSerializer.load_stream(self, stream)
+                )
+                yield batch_iter
+                # Make sure the batches are fully iterated before getting the next group
+                for _ in batch_iter:
+                    pass
+
+            elif dataframes_in_group != 0:
+                raise PySparkValueError(
+                    errorClass="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
+                    messageParameters={"dataframes_in_group": str(dataframes_in_group)},
+                )
+
+    def __repr__(self):
+        return "ArrowStreamAggArrowIterUDFSerializer"
+
+
 # Serializer for SQL_GROUPED_AGG_PANDAS_UDF and SQL_WINDOW_AGG_PANDAS_UDF
 class ArrowStreamAggPandasUDFSerializer(ArrowStreamPandasUDFSerializer):
     def __init__(
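The `yield batch_iter` / drain-loop pairing in the serializer above is the load-bearing detail: every group's lazy iterator reads from the same underlying stream, so any batches the UDF leaves unconsumed must still be read before the next group's header. A minimal standalone sketch of the same pattern, with toy data and no Arrow or Spark involved:

```python
from typing import Iterator, List

def load_groups(groups: List[List[int]]) -> Iterator[Iterator[int]]:
    """Mimics load_stream: one shared cursor, one lazy iterator per group."""
    for group in groups:
        batch_iter = iter(group)
        yield batch_iter
        # Drain whatever the consumer left behind, so the "stream"
        # position is at the start of the next group.
        for _ in batch_iter:
            pass

# A consumer that stops early, like an aggregation that short-circuits:
for g in load_groups([[1, 2, 3], [4, 5, 6]]):
    print(next(g))  # prints 1, then 4 -- the drain loop skipped 2 and 3
```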

python/pyspark/sql/pandas/typehints.py

Lines changed: 47 additions & 1 deletion
@@ -29,6 +29,7 @@
     ArrowScalarUDFType,
     ArrowScalarIterUDFType,
     ArrowGroupedAggUDFType,
+    ArrowGroupedAggIterUDFType,
     ArrowGroupedMapIterUDFType,
     ArrowGroupedMapUDFType,
     ArrowGroupedMapFunction,
@@ -156,7 +157,14 @@ def infer_pandas_eval_type(
 
 def infer_arrow_eval_type(
     sig: Signature, type_hints: Dict[str, Any]
-) -> Optional[Union["ArrowScalarUDFType", "ArrowScalarIterUDFType", "ArrowGroupedAggUDFType"]]:
+) -> Optional[
+    Union[
+        "ArrowScalarUDFType",
+        "ArrowScalarIterUDFType",
+        "ArrowGroupedAggUDFType",
+        "ArrowGroupedAggIterUDFType",
+    ]
+]:
     """
     Infers the evaluation type in :class:`pyspark.util.PythonEvalType` from
     :class:`inspect.Signature` instance and type hints.
@@ -235,6 +243,41 @@
     if is_array_agg:
         return ArrowUDFType.GROUPED_AGG
 
+    # Iterator[Tuple[pa.Array, ...]] -> Any
+    is_iterator_tuple_array_agg = (
+        len(parameters_sig) == 1
+        and check_iterator_annotation(  # Iterator
+            parameters_sig[0],
+            parameter_check_func=lambda a: check_tuple_annotation(  # Tuple
+                a,
+                parameter_check_func=lambda ta: (ta == Ellipsis or ta == pa.Array),
+            ),
+        )
+        and (
+            return_annotation != pa.Array
+            and not check_iterator_annotation(return_annotation)
+            and not check_tuple_annotation(return_annotation)
+        )
+    )
+    if is_iterator_tuple_array_agg:
+        return ArrowUDFType.GROUPED_AGG_ITER
+
+    # Iterator[pa.Array] -> Any
+    is_iterator_array_agg = (
+        len(parameters_sig) == 1
+        and check_iterator_annotation(
+            parameters_sig[0],
+            parameter_check_func=lambda a: a == pa.Array,
+        )
+        and (
+            return_annotation != pa.Array
+            and not check_iterator_annotation(return_annotation)
+            and not check_tuple_annotation(return_annotation)
+        )
+    )
+    if is_iterator_array_agg:
+        return ArrowUDFType.GROUPED_AGG_ITER
+
     return None
 
 
@@ -249,6 +292,7 @@ def infer_eval_type(
     "ArrowScalarUDFType",
     "ArrowScalarIterUDFType",
     "ArrowGroupedAggUDFType",
+    "ArrowGroupedAggIterUDFType",
 ]:
     """
     Infers the evaluation type in :class:`pyspark.util.PythonEvalType` from
@@ -264,6 +308,7 @@ def infer_eval_type(
             "ArrowScalarUDFType",
             "ArrowScalarIterUDFType",
             "ArrowGroupedAggUDFType",
+            "ArrowGroupedAggIterUDFType",
         ]
     ] = None
     if kind == "pandas":
@@ -295,6 +340,7 @@ def infer_eval_type_for_udf(  # type: ignore[no-untyped-def]
         "ArrowScalarUDFType",
         "ArrowScalarIterUDFType",
        "ArrowGroupedAggUDFType",
+        "ArrowGroupedAggIterUDFType",
     ]
 ]:
     argspec = getfullargspec(f)
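The two `is_iterator_*_agg` checks above mean the new eval type can be recovered purely from a function's annotations. A minimal sketch of what the inference accepts, assuming a build with this commit and that the wrapped UDF exposes `evalType` the way existing pandas_udf tests rely on (this mirrors `test_iterator_grouped_agg_eval_type`):

```python
import pyarrow as pa
import numpy as np
from typing import Iterator, Tuple
from pyspark.sql.functions import arrow_udf
from pyspark.util import PythonEvalType

@arrow_udf("double")
def arrow_mean(it: Iterator[pa.Array]) -> float:
    total, cnt = 0.0, 0
    for v in it:
        total += pa.compute.sum(v).as_py()
        cnt += len(v)
    return total / cnt

@arrow_udf("double")
def arrow_dot(it: Iterator[Tuple[pa.Array, pa.Array]]) -> float:
    return float(sum(np.dot(v, w) for v, w in it))

# Both hint patterns should infer the new iterator eval type.
assert arrow_mean.evalType == PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF
assert arrow_dot.evalType == PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF
```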
