pytorch
diff --git a/‎.ci/scripts/setup-windows-msvc.ps1‎
Lines changed: 52 additions & 0 deletions b/‎.ci/scripts/setup-windows-msvc.ps1‎
Lines changed: 52 additions & 0 deletions
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 4 additions & 0 deletions b/‎.github/workflows/cuda.yml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/windows-msvc.yml‎
Lines changed: 35 additions & 0 deletions b/‎.github/workflows/windows-msvc.yml‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎.mypy.ini‎
Lines changed: 6 additions & 0 deletions b/‎.mypy.ini‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 2 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎backends/arm/_passes/annotate_decomposed_matmul.py‎
Lines changed: 4 additions & 5 deletions b/‎backends/arm/_passes/annotate_decomposed_matmul.py‎
Lines changed: 4 additions & 5 deletions
diff --git a/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 21 additions & 23 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 21 additions & 23 deletions
diff --git a/‎backends/arm/_passes/arm_pass_utils.py‎
Lines changed: 6 additions & 4 deletions b/‎backends/arm/_passes/arm_pass_utils.py‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎backends/arm/_passes/cast_int64_pass.py‎
Lines changed: 0 additions & 1 deletion b/‎backends/arm/_passes/cast_int64_pass.py‎
Lines changed: 0 additions & 1 deletion
@@ -0,0 +1,52 @@
+conda create --yes --quiet -n et python=3.12  # create a conda env named "et" with Python 3.12
+conda activate et
+
+# Install cmake via conda (runs after the "et" env has been activated above)
+conda install -y cmake
+
+# Activate the VS environment - this is required for MSVC to work.
+# The VS dev shell sets up a number of environment variables that MSVC requires.
+# See https://learn.microsoft.com/en-us/cpp/build/building-on-the-command-line.
+& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64  # NOTE(review): the dev shell may change the working directory unless -SkipAutomaticLocation is passed; the relative paths below assume the repo root - confirm
+
+# Install CI requirements
+pip install -r .ci/docker/requirements-ci.txt  # NOTE(review): unlike the two cmake steps below, this step's exit code is never checked - confirm that a failed install stops the job
+
+# Start from an empty build directory: delete any existing one, then create it
+$buildDir = "cmake-out-msvc"
+if (Test-Path -Path $buildDir) {
+    Remove-Item -Path $buildDir -Recurse -Force
+}
+New-Item -Path $buildDir -ItemType Directory
+
+# Configure CMake with MSVC (not ClangCL) and disable custom/quantized ops
+cmake -S . -B $buildDir `
+    -DCMAKE_BUILD_TYPE=Release `
+    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON `
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON `
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF `
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=OFF `
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=OFF `
+    -DEXECUTORCH_BUILD_XNNPACK=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON `
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON
+
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE  # propagate cmake's exit code to the caller
+}
+
+# Build the Release configuration with 16 parallel jobs (-j16)
+cmake --build $buildDir --config Release -j16
+
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Build failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE  # propagate the build's exit code to the caller
+}
+
+Write-Host "MSVC build completed successfully!"
@@ -89,6 +89,8 @@ jobs:
 
   export-voxtral-cuda-artifact:
     name: export-voxtral-cuda-${{ matrix.quant.name }}
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -166,6 +168,8 @@ jobs:
 
   export-gemma3-cuda-artifact:
     name: export-gemma3-cuda-${{ matrix.quant.name }}
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
 
@@ -0,0 +1,35 @@
+name: Windows MSVC Build
+
+on:  # triggers: pushes to main/release branches and ciflow/trunk tags, selected pull requests, manual dispatch
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/trunk/*
+  pull_request:
+    paths:  # pull requests trigger this workflow only when they touch one of these paths
+      - .ci/docker/ci_commit_pins/pytorch.txt
+      - .ci/scripts/**
+  workflow_dispatch:
+
+concurrency:  # NOTE(review): no `schedule` trigger is declared above, so the `schedule` term in the group key is always false here
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true  # a new run in the same group cancels the run still in progress
+
+jobs:
+  build-windows-msvc:
+    name: build-windows-msvc
+    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main  # reusable workflow; the inputs under `with:` are defined there
+    with:  # NOTE(review): the script runs under `powershell`; $PSNativeCommandUseErrorActionPreference is a PowerShell 7.3+ feature - confirm it takes effect on this runner
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}  # PR head commit for pull_request events, otherwise the triggering commit
+      timeout: 60  # presumably minutes - confirm against windows_job.yml
+      script: |
+        conda init powershell
+        powershell -Command "& {
+          Set-PSDebug -Trace 1
+          \$ErrorActionPreference = 'Stop'
+          \$PSNativeCommandUseErrorActionPreference = \$true
+          .ci/scripts/setup-windows-msvc.ps1
+        }"
@@ -83,6 +83,12 @@ ignore_missing_imports = True
 [mypy-tosa_tools.*]
 ignore_missing_imports = True
 
+[mypy-tosa_serializer]
+ignore_missing_imports = True
+
+[mypy-tosa_serializer.*]
+ignore_missing_imports = True
+
 [mypy-setuptools.*]
 ignore_missing_imports = True
 
 
@@ -88,8 +88,7 @@
 from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
 from .remove_noop_pass import RemoveNoopPass  # noqa
 from .replace_scalar_with_tensor_pass import (  # noqa
-    ReplaceScalarWithTensorArgPassTOSABI,
-    ReplaceScalarWithTensorArgPassTOSAMI,
+    ReplaceScalarWithTensorByProfilePass,
 )
 from .rewrite_conv2d_pass import RewriteConv2dPass  # noqa
 from .rewrite_matmul import RewriteMatmulPass  # noqa
 
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-unsafe
 
 import itertools
 import operator
@@ -52,7 +51,7 @@ def _match_partition_to_node(
         raise RuntimeError(f"Cannot find an input node which matches, {node}.")
 
     def call(self, graph_module: GraphModule) -> PassResult:
-        matmul_partitions = get_source_partitions(
+        matmul_partitions_map = get_source_partitions(
             graph_module.graph,
             [
                 torch.matmul,
@@ -61,7 +60,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
             None,
         )
         matmul_partitions = list(
-            itertools.chain.from_iterable(matmul_partitions.values())
+            itertools.chain.from_iterable(matmul_partitions_map.values())
         )
         matmul_targets = {
             exir_ops.edge.aten.bmm.default,
@@ -89,7 +88,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
                         # Create new dq-node before matmul
                         dq_node = create_node(
                             graph=graph_module.graph,
-                            op_target=cast(EdgeOpOverload, input_node.target),  # type: ignore[arg-type]
+                            op_target=cast(EdgeOpOverload, input_node.target),
                         )
                         dq_node.args = (node, *input_node.args[1:])
                         matmul_node.replace_input_with(node, dq_node)
@@ -110,7 +109,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     # Create q-node after matmul
                     q_node = create_node(
                         graph=graph_module.graph,
-                        op_target=cast(EdgeOpOverload, partition_output.target),  # type: ignore[arg-type]
+                        op_target=cast(EdgeOpOverload, partition_output.target),
                     )
                     matmul_node.replace_all_uses_with(q_node)
                     q_node.args = (matmul_node, *partition_output.args[1:])
 
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-unsafe
 
 import traceback
 from abc import abstractmethod
 
@@ -5,8 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-unsafe
-
 
 from collections import defaultdict
 
@@ -89,8 +87,7 @@
     QuantizeOperatorArguments,
     RemoveNoopPass,
     ReplaceInfValues,
-    ReplaceScalarWithTensorArgPassTOSABI,
-    ReplaceScalarWithTensorArgPassTOSAMI,
+    ReplaceScalarWithTensorByProfilePass,
     RetraceFoldedDtypesPass,
     RewriteConv2dPass,
     RewriteMatmulPass,
@@ -156,15 +153,15 @@ def _transform(self, graph_module: GraphModule):
         with TosaLoweringContext(self.tosa_spec):
             return self(graph_module).graph_module
 
-    def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+    def _tosa_INT_pipeline(
+        self, exported_program: ExportedProgram, graph_module: GraphModule
+    ) -> GraphModule:
         self.add_pass(AnnotateOutputDimOrderPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
-        self.add_pass(
-            DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
-        )
+        self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
@@ -174,7 +171,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
             self.add_pass(CastToInt32Pass())
 
         self.add_pass(CastBoolToInt8Pass())
-        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
+        self.add_pass(ReplaceScalarWithTensorByProfilePass())
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())
         self.add_pass(ConvertELUParamsPass())
@@ -194,7 +191,6 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
-        self.add_pass(DecomposeSumPass())
         self.add_pass(DecomposeCumsumPass(exported_program))
         self.add_pass(Conv1dUnsqueezePass())
         self.add_pass(DecomposeMaxPool2DPass())
@@ -215,15 +211,18 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(RewriteMatmulPass())
         self.add_pass(RewriteUpsamplePass())
         self.add_pass(FuseEqualPlaceholdersPass(exported_program))
+        self.add_pass(InsertRescaleInt32Pass())
+        self.add_pass(DecomposeSumPass())
         self.add_pass(ToTosaMemoryFormatPass(exported_program))
         self.add_pass(RemoveNoopPass())
         self.add_pass(InsertRescalePass())
-        self.add_pass(InsertRescaleInt32Pass())
 
         self.validate_constraints_mandatory()
-        return self._transform(exported_program.graph_module)
+        return self._transform(graph_module)
 
-    def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+    def _tosa_FP_pipeline(
+        self, exported_program: ExportedProgram, graph_module: GraphModule
+    ) -> GraphModule:
         self.add_pass(AnnotateOutputDimOrderPass())
         self.add_pass(DecomposeExpm1Pass())
         self.add_pass(DecomposeLogitPass())
@@ -244,7 +243,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(DecomposeSinhPass())
         self.add_pass(DecomposeSignPass())
         self.add_pass(DecomposeDivTensorModePass())
-        self.add_pass(ReplaceScalarWithTensorArgPassTOSAMI())
+        self.add_pass(ReplaceScalarWithTensorByProfilePass())
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
@@ -258,9 +257,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeBatchNormNoStatsPass())
         self.add_pass(DecomposeVarPass())
-        self.add_pass(
-            DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
-        )
+        self.add_pass(DecomposeMeanDimPass(graph_module, self.tosa_spec))
         self.add_pass(DecomposeNotEqualPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeAddSubAlphaPass())
@@ -308,14 +305,16 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(InsertRescalePass())
 
         self.validate_constraints_mandatory()
-        return self._transform(exported_program.graph_module)
+        return self._transform(graph_module)
 
-    def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
+    def transform_to_backend_pipeline(
+        self, exported_program: ExportedProgram, graph_module: GraphModule
+    ):
         """Apply passes before transforming program to backend"""
         if self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+FP"):
-            return self._tosa_FP_pipeline(exported_program)
+            return self._tosa_FP_pipeline(exported_program, graph_module)
         elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-1.0+INT"):
-            return self._tosa_INT_pipeline(exported_program)
+            return self._tosa_INT_pipeline(exported_program, graph_module)
         else:
             raise NotImplementedError(
                 f"No pass pipeline implemented for {self.tosa_spec=}"
@@ -337,7 +336,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeAddmmPass())
         self.add_pass(DecomposeDivTensorModePass())
         self.add_pass(DecomposeAddSubAlphaPass())
-        self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
+        self.add_pass(ReplaceScalarWithTensorByProfilePass())
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeGroupNormPass())
         self.add_pass(DecomposeLayerNormPass())
@@ -361,7 +360,6 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
 
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ReplaceInfValues())
-        self.add_pass(DecomposeSumPass())
 
         if not self.tosa_spec.is_U55_subset:
             # Uses where which is not supported on Ethos-U55
 
@@ -5,7 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-unsafe
 
 import traceback
 from inspect import isclass
@@ -14,8 +13,10 @@
 import torch
 import torch.fx
 from executorch.backends.arm.common.debug import get_node_debug_info
+from executorch.backends.arm.common.type import ensure_type
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.dialects.edge._ops import EdgeOpOverload
 
 from torch._export.utils import (
     get_buffer,
@@ -82,17 +83,18 @@ def get_param_tensor(
     elif is_lifted_tensor_constant(exp_prog, node):
         return get_lifted_tensor_constant(exp_prog, node)
     elif is_get_attr_node(node):
+        target_node = ensure_type(str, node.target)
         # This is a hack to support both lifted and unlifted graph
         try:
-            return getattr(node.graph.owning_module, node.target)  # type: ignore[arg-type]
+            return getattr(node.graph.owning_module, target_node)
         except AttributeError:
-            return getattr(exp_prog.graph_module, node.target)  # type: ignore[arg-type]
+            return getattr(exp_prog.graph_module, target_node)
     raise RuntimeError(f"unsupported param type, {node.op}.")
 
 
 def create_node(
     graph: torch.fx.Graph,
-    op_target: OpOverload,
+    op_target: OpOverload | EdgeOpOverload,
     args: tuple = (),
     kwargs: Optional[dict] = None,
     quantize: bool = False,
 
@@ -3,7 +3,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# pyre-unsafe
 
 import logging
 from typing import Set, Type