Skip to content

Commit 6e861d0

Browse files
sfc-gh-jkew, sfc-gh-joshi, sfc-gh-mvashishtha
authored
FEAT-#7676: in-place casting between DataFrame engines (#7666)
Addresses a performance issue with hybrid execution where a sequence of merge operations using the same DataFrames could result in transfer thrashing. In this PR we have the arguments of the operation move in-place so that subsequent operations all stay on the same backend. This behavior can be turned off by setting the `BackendMergeCastInPlace` variable to false. <!-- Please give a short brief about these changes. --> - [x] first commit message and PR title follow format outlined [here](https://modin.readthedocs.io/en/latest/development/contributing.html#commit-message-formatting) > **_NOTE:_** If you edit the PR title to match this format, you need to add another commit (even if it's empty) or amend your last commit for the CI job that checks the PR title to pick up the new PR title. - [x] passes `flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py` - [x] passes `black --check modin/ asv_bench/benchmarks scripts/doc_checker.py` - [x] signed commit with `git commit -s` <!-- you can amend your commit with a signature via `git commit -amend -s` --> - [x] Resolves #7676 <!-- issue must be created for each patch --> - [x] tests added and passing - [x] module layout described at `docs/development/architecture.rst` is up-to-date <!-- if you have added, renamed or removed files or directories please update the documentation accordingly --> --------- Co-authored-by: Jonathan Shi <149419494+sfc-gh-joshi@users.noreply.github.com> Co-authored-by: Mahesh Vashishtha <mahesh.vashishtha@snowflake.com>
1 parent 86107d4 commit 6e861d0

File tree

8 files changed

+115
-22
lines changed

8 files changed

+115
-22
lines changed

modin/config/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
AsyncReadMode,
2020
AutoSwitchBackend,
2121
Backend,
22+
BackendMergeCastInPlace,
2223
BenchmarkMode,
2324
CIAWSAccessKeyID,
2425
CIAWSSecretAccessKey,
@@ -78,6 +79,7 @@
7879
"GpuCount",
7980
"Memory",
8081
"Backend",
82+
"BackendMergeCastInPlace",
8183
"Execution",
8284
"AutoSwitchBackend",
8385
"ShowBackendSwitchProgress",

modin/config/envvars.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1385,6 +1385,30 @@ def disable(cls) -> None:
13851385
cls.put(False)
13861386

13871387

1388+
class BackendMergeCastInPlace(EnvironmentVariable, type=bool):
1389+
"""
1390+
Whether to cast a DataFrame in-place when performing a merge when using hybrid mode.
1391+
1392+
This flag modifies the behavior of a cast performed on operations involving more
1393+
than one type of query compiler. If enabled the actual cast will be performed in-place
1394+
and the input DataFrame will have a new backend. If disabled the original DataFrame
1395+
will remain on the same underlying engine.
1396+
"""
1397+
1398+
varname = "MODIN_BACKEND_MERGE_CAST_IN_PLACE"
1399+
default = True
1400+
1401+
@classmethod
1402+
def enable(cls) -> None:
1403+
"""Enable casting in place when performing a merge operation between two different compilers."""
1404+
cls.put(True)
1405+
1406+
@classmethod
1407+
def disable(cls) -> None:
1408+
"""Disable casting in place when performing a merge operation between two different compilers."""
1409+
cls.put(False)
1410+
1411+
13881412
class DynamicPartitioning(EnvironmentVariable, type=bool):
13891413
"""
13901414
Set to true to use Modin's dynamic-partitioning implementation where possible.

modin/core/storage_formats/pandas/query_compiler.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2717,6 +2717,8 @@ def fillna(self, **kwargs):
27172717
full_axis = method is not None or limit is not None
27182718
new_dtypes = None
27192719
if isinstance(value, BaseQueryCompiler):
2720+
# This code assumes that the operation occurs with the same query compiler
2721+
assert isinstance(value, PandasQueryCompiler)
27202722
if squeeze_self:
27212723
# Self is a Series type object
27222724
if full_axis:

modin/core/storage_formats/pandas/query_compiler_caster.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
from pandas.core.indexes.frozen import FrozenList
3232
from typing_extensions import Self
3333

34-
from modin.config import AutoSwitchBackend, Backend
34+
from modin.config import AutoSwitchBackend, Backend, BackendMergeCastInPlace
3535
from modin.config import context as config_context
3636
from modin.core.storage_formats.base.query_compiler import (
3737
BaseQueryCompiler,
@@ -92,12 +92,12 @@ def _normalize_class_name(class_of_wrapped_fn: Optional[str]) -> str:
9292
"_getattr__from_extension_impl",
9393
"get_backend",
9494
"move_to",
95-
"_update_inplace",
9695
"set_backend",
9796
"_get_extension",
9897
"_query_compiler",
9998
"_get_query_compiler",
10099
"_copy_into",
100+
"_update_inplace",
101101
"is_backend_pinned",
102102
"_set_backend_pinned",
103103
"pin_backend",
@@ -121,6 +121,7 @@ def _normalize_class_name(class_of_wrapped_fn: Optional[str]) -> str:
121121
"_set_backend_pinned",
122122
"pin_backend",
123123
"unpin_backend",
124+
"_update_inplace",
124125
}
125126

126127

@@ -1120,10 +1121,20 @@ def cast_to_qc(arg):
11201121
and arg.get_backend() != result_backend
11211122
):
11221123
return arg
1123-
cast = arg.set_backend(
1124-
result_backend,
1125-
switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{name}",
1126-
)
1124+
if BackendMergeCastInPlace.get():
1125+
arg.set_backend(
1126+
result_backend,
1127+
switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{name}",
1128+
inplace=True,
1129+
)
1130+
assert arg.get_backend() == result_backend
1131+
cast = arg
1132+
else:
1133+
cast = arg.set_backend(
1134+
result_backend,
1135+
switch_operation=f"{_normalize_class_name(class_of_wrapped_fn)}.{name}",
1136+
inplace=False,
1137+
)
11271138
inplace_update_trackers.append(
11281139
InplaceUpdateTracker(
11291140
input_castable=arg,
@@ -1156,7 +1167,7 @@ def cast_to_qc(arg):
11561167
new_castable,
11571168
) in inplace_update_trackers:
11581169
new_qc = new_castable._get_query_compiler()
1159-
if original_qc is not new_qc:
1170+
if BackendMergeCastInPlace.get() or original_qc is not new_qc:
11601171
new_castable._copy_into(original_castable)
11611172

11621173
return _maybe_switch_backend_post_op(

modin/pandas/series.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2925,6 +2925,17 @@ def set_backend(
29252925
*,
29262926
switch_operation: Optional[str] = None,
29272927
) -> Optional[Self]:
2928+
# A series which is moved, potentially without its parent needs to
2929+
# have its parent reset. This is aligned with CoW chained assignment
2930+
# semantics as well, but it is a little different from existing modin
2931+
# semantics. This is why we only do this for hybrid and inplace
2932+
# modification.
2933+
if (
2934+
inplace
2935+
and self._parent is not None
2936+
and backend != self._parent.get_backend()
2937+
):
2938+
self._parent = None
29282939
return super().set_backend(
29292940
backend=backend, inplace=inplace, switch_operation=switch_operation
29302941
)

modin/tests/pandas/extensions/test_pd_extensions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ def python_concat(*args, **kwargs):
163163
== "pandas_concat_result"
164164
)
165165

166+
# With inplace casting we need to reset the original dataframes
167+
modin_on_pandas_df.move_to("Pandas", inplace=True)
168+
modin_on_python_df.move_to("Python_Test", inplace=True)
169+
166170
assert (
167171
pd.concat([modin_on_python_df, modin_on_pandas_df])
168172
== "python_concat_result"

modin/tests/pandas/native_df_interoperability/test_compiler_caster.py

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,8 @@ def test_two_same_backend(pico_df):
413413

414414
def test_cast_to_second_backend_with_concat(pico_df, cluster_df, caplog):
415415
with caplog.at_level(level=logging.INFO, logger=DEFAULT_LOGGER_NAME):
416-
df3 = pd.concat([pico_df, cluster_df], axis=1)
416+
# We have to copy the input dataframes because of inplace merging
417+
df3 = pd.concat([pico_df.copy(), cluster_df.copy()], axis=1)
417418
assert pico_df.get_backend() == "Pico"
418419
assert cluster_df.get_backend() == "Cluster"
419420
assert df3.get_backend() == "Cluster" # result should be on cluster
@@ -431,7 +432,10 @@ def test_cast_to_second_backend_with_concat_uses_second_backend_api_override(
431432
register_pd_accessor(name="concat", backend="Cluster")(
432433
lambda *args, **kwargs: "custom_concat_result"
433434
)
434-
assert pd.concat([pico_df, cluster_df], axis=1) == "custom_concat_result"
435+
# copy dataframes for concat to allow for in-place merging
436+
assert (
437+
pd.concat([pico_df.copy(), cluster_df.copy()], axis=1) == "custom_concat_result"
438+
)
435439
assert pico_df.get_backend() == "Pico"
436440
assert cluster_df.get_backend() == "Cluster"
437441

@@ -449,14 +453,16 @@ def test_moving_pico_to_cluster_in_place_calls_set_backend_only_once_github_issu
449453

450454
def test_cast_to_second_backend_with___init__(pico_df, cluster_df):
451455
df3 = pd.DataFrame({"pico": pico_df.iloc[:, 0], "cluster": cluster_df.iloc[:, 0]})
452-
assert pico_df.get_backend() == "Pico"
456+
assert (
457+
pico_df.get_backend() == "Pico"
458+
) # pico stays despite in-place casting by iloc
453459
assert cluster_df.get_backend() == "Cluster"
454460
assert df3.get_backend() == "Cluster" # result should be on cluster
455461

456462

457463
def test_cast_to_first_backend(pico_df, cluster_df):
458464
df3 = pd.concat([cluster_df, pico_df], axis=1)
459-
assert pico_df.get_backend() == "Pico"
465+
assert pico_df.get_backend() == "Cluster" # pico_df was cast in place by concat
460466
assert cluster_df.get_backend() == "Cluster"
461467
assert df3.get_backend() == cluster_df.get_backend() # result should be on cluster
462468

@@ -468,7 +474,7 @@ def test_cast_to_first_backend_with_concat_uses_first_backend_api_override(
468474
lambda *args, **kwargs: "custom_concat_result"
469475
)
470476
assert pd.concat([cluster_df, pico_df], axis=1) == "custom_concat_result"
471-
assert pico_df.get_backend() == "Pico"
477+
assert pico_df.get_backend() == "Cluster" # pico was cast in place by concat
472478
assert cluster_df.get_backend() == "Cluster"
473479

474480

@@ -479,7 +485,7 @@ def test_cast_to_first_backend_with___init__(pico_df, cluster_df):
479485
"pico": pico_df.iloc[:, 0],
480486
}
481487
)
482-
assert pico_df.get_backend() == "Pico"
488+
assert pico_df.get_backend() == "Pico" # Pico not cast in place by iloc
483489
assert cluster_df.get_backend() == "Cluster"
484490
assert df3.get_backend() == "Cluster" # result should be on cluster
485491

@@ -557,31 +563,33 @@ def test_two_two_qc_types_default_rhs(default_df, cluster_df):
557563
# so we default to the caller
558564
df3 = pd.concat([default_df, cluster_df], axis=1)
559565
assert default_df.get_backend() == "Test_casting_default"
560-
assert cluster_df.get_backend() == "Cluster"
566+
assert (
567+
cluster_df.get_backend() == "Test_casting_default"
568+
) # in place cast to default by concat
561569
assert df3.get_backend() == default_df.get_backend() # should move to default
562570

563571

564572
def test_two_two_qc_types_default_lhs(default_df, cluster_df):
565573
# none of the query compilers know about each other here
566574
# so we default to the caller
567575
df3 = pd.concat([cluster_df, default_df], axis=1)
568-
assert default_df.get_backend() == "Test_casting_default"
576+
assert default_df.get_backend() == "Cluster" # in place cast to Cluster by concat
569577
assert cluster_df.get_backend() == "Cluster"
570578
assert df3.get_backend() == cluster_df.get_backend() # should move to cluster
571579

572580

573581
def test_two_two_qc_types_default_2_rhs(default_df, cloud_df):
574582
# cloud knows a bit about costing; so we prefer moving to there
575583
df3 = pd.concat([default_df, cloud_df], axis=1)
576-
assert default_df.get_backend() == "Test_casting_default"
584+
assert default_df.get_backend() == "Cloud" # inplace cast to Cloud by concat
577585
assert cloud_df.get_backend() == "Cloud"
578586
assert df3.get_backend() == cloud_df.get_backend() # should move to cloud
579587

580588

581589
def test_two_two_qc_types_default_2_lhs(default_df, cloud_df):
582590
# cloud knows a bit about costing; so we prefer moving to there
583591
df3 = pd.concat([cloud_df, default_df], axis=1)
584-
assert default_df.get_backend() == "Test_casting_default"
592+
assert default_df.get_backend() == "Cloud" # inplace cast to Cloud by concat
585593
assert cloud_df.get_backend() == "Cloud"
586594
assert df3.get_backend() == cloud_df.get_backend() # should move to cloud
587595

@@ -651,6 +659,22 @@ def test_qc_mixed_loc(pico_df, cloud_df):
651659
assert cloud_df1[pico_df1[0][0]][pico_df1[0][1]] == 1
652660

653661

662+
def test_merge_in_place(default_df, lazy_df, cloud_df):
663+
# lazy_df tries to pawn off work on other engines
664+
df = default_df.merge(lazy_df)
665+
assert df.get_backend() is default_df.get_backend()
666+
# Both arguments now have the same qc type
667+
assert lazy_df.get_backend() is default_df.get_backend()
668+
669+
with config_context(BackendMergeCastInPlace=False):
670+
lazy_df = lazy_df.move_to("Lazy")
671+
cloud_df = cloud_df.move_to("Cloud")
672+
df = cloud_df.merge(lazy_df)
673+
assert type(df) is type(cloud_df)
674+
assert lazy_df.get_backend() == "Lazy"
675+
assert cloud_df.get_backend() == "Cloud"
676+
677+
654678
def test_information_asymmetry(default_df, cloud_df, eager_df, lazy_df):
655679
# normally, the default query compiler should be chosen
656680
# here, but since eager knows about default, but not
@@ -1487,7 +1511,11 @@ def test_groupby_apply_switches_for_small_input(
14871511
pandas_result = operation(pandas_df)
14881512
df_equals(modin_result, pandas_result)
14891513
assert modin_result.get_backend() == expected_backend
1490-
assert modin_df.get_backend() == expected_backend
1514+
if groupby_class == "DataFrameGroupBy":
1515+
assert modin_df.get_backend() == expected_backend
1516+
# The original dataframe does not move with the SeriesGroupBy
1517+
if groupby_class == "SeriesGroupBy":
1518+
assert modin_df.get_backend() == "Big_Data_Cloud"
14911519

14921520
def test_T_switches(self):
14931521
# Ensure that calling df.T triggers a switch (GH#7653)

modin/tests/pandas/native_df_interoperability/utils.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
from modin import set_execution
1717
from modin.config import Engine, StorageFormat
18+
from modin.config import context as config_context
19+
from modin.config.envvars import Backend
1820
from modin.tests.pandas.utils import (
1921
NoModinException,
2022
create_test_dfs,
@@ -43,9 +45,13 @@ def create_test_df_in_defined_mode(
4345

4446
if not isinstance(native, bool):
4547
raise ValueError("`native` should be True or False.")
46-
48+
hybrid_backend = "Pandas" if native else Backend.get()
4749
with switch_to_native_execution() if native else nullcontext():
48-
return create_test_dfs(*args, post_fn=post_fn, backend=backend, **kwargs)
50+
with config_context(AutoSwitchBackend=False, Backend=hybrid_backend):
51+
modin_df, pandas_df = create_test_dfs(
52+
*args, post_fn=post_fn, backend=backend, **kwargs
53+
)
54+
return modin_df, pandas_df
4955

5056

5157
def create_test_series_in_defined_mode(
@@ -56,8 +62,13 @@ def create_test_series_in_defined_mode(
5662
if not isinstance(native, bool):
5763
raise ValueError("`native` should be True or False.")
5864

65+
hybrid_backend = "Pandas" if native else "Ray"
5966
with switch_to_native_execution() if native else nullcontext():
60-
return create_test_series(vals, sort=sort, backend=backend, **kwargs)
67+
with config_context(AutoSwitchBackend=False, Backend=hybrid_backend):
68+
modin_ser, pandas_ser = create_test_series(
69+
vals, sort=sort, backend=backend, **kwargs
70+
)
71+
return modin_ser, pandas_ser
6172

6273

6374
def eval_general_interop(
@@ -110,7 +121,7 @@ def execute_callable(fn, inplace=False, md_kwargs={}, pd_kwargs={}):
110121
assert (
111122
type(md_e) is type(expected_exception)
112123
and md_e.args == expected_exception.args
113-
), f"not acceptable Modin's exception: [{repr(md_e)}]"
124+
), f"not acceptable Modin's exception: [{repr(md_e)}] expected {expected_exception}"
114125
assert (
115126
pd_e.args == expected_exception.args
116127
), f"not acceptable Pandas' exception: [{repr(pd_e)}]"

0 commit comments

Comments
 (0)