Skip to content

Commit 42eaac8

Browse files
committed
Merge branch 'develop' of github.com:ROCm/AMDMIGraphX into ins_debug_symbols
2 parents e66c848 + f481878 commit 42eaac8

File tree

8 files changed

+178
-14
lines changed

8 files changed

+178
-14
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Full documentation for MIGraphX is available at
2626

2727
### Optimized
2828

29+
* Added a new pass that replaces a convolution whose input is a constant broadcast with a reduced GEMM, which improves model compilation time (#4621).
30+
2931
### Removed
3032

3133
## MIGraphX 2.15 for ROCm 7.2.0

docs/sphinx/requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
rocm-docs-core==1.31.3
1+
rocm-docs-core==1.32.0
22
sphinx-collapse

docs/sphinx/requirements.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ click==8.1.7
6161
# sphinx-external-toc
6262
comm==0.2.2
6363
# via ipykernel
64-
cryptography==44.0.1
64+
cryptography==46.0.5
6565
# via pyjwt
6666
debugpy==1.8.12
6767
# via ipykernel
@@ -212,7 +212,7 @@ requests==2.32.4
212212
# via
213213
# pygithub
214214
# sphinx
215-
rocm-docs-core==1.31.3
215+
rocm-docs-core==1.32.0
216216
# via -r requirements.in
217217
rpds-py==0.22.3
218218
# via
@@ -285,8 +285,9 @@ traitlets==5.14.3
285285
# matplotlib-inline
286286
# nbclient
287287
# nbformat
288-
typing-extensions==4.12.2
288+
typing-extensions==4.15.0
289289
# via
290+
# cryptography
290291
# ipython
291292
# myst-nb
292293
# pydata-sphinx-theme

src/propagate_constant.cpp

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,13 @@ static bool is_const_ins(instruction_ref ins, const std::unordered_set<std::stri
5959
skip_ops.find(ins->name()) == skip_ops.end();
6060
}
6161

62-
static argument as_packed(const argument& c)
62+
static literal as_packed(const argument& c)
6363
{
6464
if(c.get_shape().packed())
65-
return c;
65+
return {c.get_shape(), c.data()};
6666
auto s = c.get_shape().with_lens(c.get_shape().lens());
67-
argument result;
68-
c.visit([&](auto x) { result = literal{s, x.begin(), x.end()}.get_argument(); });
67+
literal result;
68+
c.visit([&](auto x) { result = literal{s, x.begin(), x.end()}; });
6969
return result;
7070
}
7171

@@ -98,11 +98,16 @@ void propagate_constant::apply(module& m) const
9898

9999
// Compute literals in parallel
100100
std::vector<instruction_ref> const_instrs_vec{const_instrs.begin(), const_instrs.end()};
101-
std::vector<argument> literals(const_instrs_vec.size());
101+
std::vector<literal> literals(const_instrs_vec.size());
102102
std::size_t grainsize = 1;
103+
#ifdef _WIN32
104+
grainsize = std::max<std::size_t>(
105+
const_instrs_vec.size() / (std::thread::hardware_concurrency() / 2), 1);
106+
#else
103107
#if !MIGRAPHX_HAS_EXECUTORS
104108
std::size_t n = std::max<std::size_t>(2048 / std::thread::hardware_concurrency(), 1);
105109
grainsize = const_instrs_vec.size() / n;
110+
#endif
106111
#endif
107112
simple_par_for(const_instrs_vec.size(), grainsize, [&](const auto i) {
108113
literals[i] = as_packed(const_instrs_vec[i]->eval());
@@ -128,7 +133,7 @@ void propagate_constant::apply(module& m) const
128133
}
129134
assert(literals[i].get_shape().lens() == const_instrs_vec[i]->get_shape().lens());
130135
assert(literals[i].get_shape().bytes() <= const_instrs_vec[i]->get_shape().bytes());
131-
auto l = m.add_literal(literals[i].get_shape(), literals[i].data());
136+
auto l = m.add_literal(literals[i]);
132137
m.replace_instruction(const_instrs_vec[i], l);
133138
}
134139
}

src/simplify_algebra.cpp

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2160,11 +2160,83 @@ struct find_split_transpose
21602160
}
21612161
};
21622162

2163+
// When a convolution's input is a spatially-broadcast constant (e.g. a bias
2164+
// vector broadcast to [N, IC, H, W] with stride-0 spatial dims), the full
2165+
// spatial convolution is redundant. Replace it with:
2166+
// W_reduced[oc,ic] = sum_{kh,kw} W[oc,ic,kh,kw] (reduce_sum)
2167+
// result = dot(input_2d, W_reduced^T) (tiny GEMM)
2168+
// multibroadcast result to the original output shape
2169+
struct find_conv_broadcast_input
2170+
{
2171+
auto matcher() const
2172+
{
2173+
return match::name("convolution")(match::args(
2174+
match::name("broadcast", "multibroadcast")(match::args(match::any().bind("x")))
2175+
.bind("bcast"),
2176+
match::is_constant().bind("w")));
2177+
}
2178+
2179+
void apply(module& m, const match::matcher_result& r) const
2180+
{
2181+
auto ins = r.result;
2182+
auto x_ins = r.instructions["x"];
2183+
auto w_ins = r.instructions["w"];
2184+
2185+
if(ins->get_operator().to_value()["group"].to<int>() != 1)
2186+
return;
2187+
2188+
const auto& x_shape = x_ins->get_shape();
2189+
const auto& w_shape = w_ins->get_shape();
2190+
2191+
const auto& x_lens = x_shape.lens();
2192+
if(x_lens.size() > 2 and
2193+
std::any_of(x_lens.begin() + 2, x_lens.end(), [](auto l) { return l != 1; }))
2194+
return;
2195+
2196+
auto oc = w_shape.lens()[0];
2197+
auto ic = w_shape.lens()[1];
2198+
2199+
auto out_lens = ins->get_shape().lens();
2200+
auto n = out_lens[0];
2201+
2202+
if(x_shape.elements() != n * ic)
2203+
return;
2204+
2205+
auto ndim = w_shape.ndim();
2206+
std::vector<int64_t> spatial_axes(ndim - 2);
2207+
std::iota(spatial_axes.begin(), spatial_axes.end(), 2);
2208+
2209+
auto w_reduced =
2210+
m.insert_instruction(ins, make_op("reduce_sum", {{"axes", spatial_axes}}), w_ins);
2211+
auto w_2d = m.insert_instruction(
2212+
ins, make_op("reshape", {{"dims", std::vector<std::size_t>{oc, ic}}}), w_reduced);
2213+
auto w_t = m.insert_instruction(
2214+
ins, make_op("transpose", {{"permutation", std::vector<int64_t>{1, 0}}}), w_2d);
2215+
2216+
instruction_ref x_2d;
2217+
if(x_shape.ndim() == 1 and n == 1)
2218+
x_2d = m.insert_instruction(
2219+
ins, make_op("unsqueeze", {{"axes", std::vector<int64_t>{0}}}), x_ins);
2220+
else
2221+
x_2d = m.insert_instruction(
2222+
ins, make_op("reshape", {{"dims", std::vector<std::size_t>{n, ic}}}), x_ins);
2223+
2224+
auto dot_result = m.insert_instruction(ins, make_op("dot"), x_2d, w_t);
2225+
2226+
auto dot_1d = m.insert_instruction(
2227+
ins, make_op("squeeze", {{"axes", std::vector<int64_t>{0}}}), dot_result);
2228+
2229+
m.replace_instruction(
2230+
ins, make_op("broadcast", {{"axis", 1}, {"out_lens", out_lens}}), dot_1d);
2231+
}
2232+
};
2233+
21632234
void simplify_algebra::apply(module& m) const
21642235
{
21652236
// Run simplifications multiple times
21662237
m.repeat_while_changes(8, [&] {
21672238
match::find_matches(m,
2239+
find_conv_broadcast_input{},
21682240
find_inner_broadcast{},
21692241
find_dot_broadcast{},
21702242
find_double_add_lit_broadcast{},

test/py/requirements-onnx.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#####################################################################################
2424
onnx==1.18.0;python_version>="3.11"
2525
onnx==1.14.1;python_version<"3.11"
26-
protobuf==4.25.8
26+
protobuf==5.29.6
2727
numpy==1.26.4;python_version>="3.11"
2828
numpy==1.21.6;python_version<"3.11"
2929
packaging==23.0

test/simplify_algebra_test.cpp

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* The MIT License (MIT)
33
*
4-
* Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
4+
* Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
55
*
66
* Permission is hereby granted, free of charge, to any person obtaining a copy
77
* of this software and associated documentation files (the "Software"), to deal
@@ -4733,4 +4733,88 @@ TEST_CASE(find_concat_different_broadcast_axes)
47334733
EXPECT(m1.sort() == m2.sort());
47344734
}
47354735

4736+
TEST_CASE(conv_broadcast_input)
4737+
{
4738+
migraphx::shape xs{migraphx::shape::float_type, {64}};
4739+
migraphx::shape ws{migraphx::shape::float_type, {64, 64, 3, 3}};
4740+
migraphx::module m1;
4741+
{
4742+
auto x = m1.add_parameter("x", xs);
4743+
auto bcast = m1.add_instruction(
4744+
migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 64, 4, 4}}}), x);
4745+
auto w = m1.add_literal(migraphx::generate_literal(ws, 1));
4746+
auto conv = m1.add_instruction(migraphx::make_op("convolution"), bcast, w);
4747+
m1.add_instruction(pass_op{}, conv);
4748+
}
4749+
run_pass(m1);
4750+
4751+
migraphx::module m2;
4752+
{
4753+
auto x = m2.add_parameter("x", xs);
4754+
auto w = m2.add_literal(migraphx::generate_literal(ws, 1));
4755+
auto wr = m2.add_instruction(migraphx::make_op("reduce_sum", {{"axes", {2, 3}}}), w);
4756+
auto w2d = m2.add_instruction(migraphx::make_op("reshape", {{"dims", {64, 64}}}), wr);
4757+
auto wt =
4758+
m2.add_instruction(migraphx::make_op("transpose", {{"permutation", {1, 0}}}), w2d);
4759+
auto x2d = m2.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {0}}}), x);
4760+
auto dr = m2.add_instruction(migraphx::make_op("dot"), x2d, wt);
4761+
auto d1 = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), dr);
4762+
auto r = m2.add_instruction(
4763+
migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 64, 2, 2}}}), d1);
4764+
m2.add_instruction(pass_op{}, r);
4765+
}
4766+
EXPECT(m1.sort() == m2.sort());
4767+
}
4768+
4769+
TEST_CASE(conv_multibroadcast_input)
4770+
{
4771+
migraphx::shape xs{migraphx::shape::float_type, {1, 64, 1, 1}};
4772+
migraphx::shape ws{migraphx::shape::float_type, {64, 64, 3, 3}};
4773+
migraphx::module m1;
4774+
{
4775+
auto x = m1.add_parameter("x", xs);
4776+
auto bcast = m1.add_instruction(
4777+
migraphx::make_op("multibroadcast", {{"out_lens", {1, 64, 4, 4}}}), x);
4778+
auto w = m1.add_literal(migraphx::generate_literal(ws, 1));
4779+
auto conv = m1.add_instruction(migraphx::make_op("convolution"), bcast, w);
4780+
m1.add_instruction(pass_op{}, conv);
4781+
}
4782+
run_pass(m1);
4783+
4784+
migraphx::module m2;
4785+
{
4786+
auto x = m2.add_parameter("x", xs);
4787+
auto w = m2.add_literal(migraphx::generate_literal(ws, 1));
4788+
auto wr = m2.add_instruction(migraphx::make_op("reduce_sum", {{"axes", {2, 3}}}), w);
4789+
auto w2d = m2.add_instruction(migraphx::make_op("reshape", {{"dims", {64, 64}}}), wr);
4790+
auto wt =
4791+
m2.add_instruction(migraphx::make_op("transpose", {{"permutation", {1, 0}}}), w2d);
4792+
auto x2d = m2.add_instruction(migraphx::make_op("reshape", {{"dims", {1, 64}}}), x);
4793+
auto dr = m2.add_instruction(migraphx::make_op("dot"), x2d, wt);
4794+
auto d1 = m2.add_instruction(migraphx::make_op("squeeze", {{"axes", {0}}}), dr);
4795+
auto r = m2.add_instruction(
4796+
migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 64, 2, 2}}}), d1);
4797+
m2.add_instruction(pass_op{}, r);
4798+
}
4799+
EXPECT(m1.sort() == m2.sort());
4800+
}
4801+
4802+
TEST_CASE(conv_broadcast_input_group)
4803+
{
4804+
migraphx::shape xs{migraphx::shape::float_type, {64}};
4805+
migraphx::shape ws{migraphx::shape::float_type, {64, 32, 3, 3}};
4806+
migraphx::module m1;
4807+
{
4808+
auto x = m1.add_parameter("x", xs);
4809+
auto bcast = m1.add_instruction(
4810+
migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", {1, 64, 4, 4}}}), x);
4811+
auto w = m1.add_literal(migraphx::generate_literal(ws, 1));
4812+
auto conv = m1.add_instruction(migraphx::make_op("convolution", {{"group", 2}}), bcast, w);
4813+
m1.add_instruction(pass_op{}, conv);
4814+
}
4815+
migraphx::module m2 = m1;
4816+
run_pass(m1);
4817+
EXPECT(m1.sort() == m2.sort());
4818+
}
4819+
47364820
int main(int argc, const char* argv[]) { test::run(argc, argv); }

tools/requirements-py.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#####################################################################################
22
# The MIT License (MIT)
33
#
4-
# Copyright (c) 2015-2025 Advanced Micro Devices, Inc. All rights reserved.
4+
# Copyright (c) 2015-2026 Advanced Micro Devices, Inc. All rights reserved.
55
#
66
# Permission is hereby granted, free of charge, to any person obtaining a copy
77
# of this software and associated documentation files (the "Software"), to deal
@@ -29,4 +29,4 @@ typing==3.7.4
2929
pytest==6.0.1
3030
packaging==23.0
3131
# pin version of protobuf in Python for onnx runtime unit tests between dist versions
32-
protobuf==4.25.8
32+
protobuf==6.33.5

0 commit comments

Comments
 (0)