From 23d334b2e05f5fdc7320384c8b47c370c602dd54 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 19 Jul 2022 11:38:17 -0700
Subject: [PATCH 01/29] Add back support for PYTORCH_TEST_WITH_MPS (#66)

Fix the TEST_WITH_MPS macro.
---
 torch/testing/_internal/common_device_type.py | 8 +++-----
 torch/testing/_internal/common_utils.py       | 1 +
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 8e34ec10a835..75e87155c7ca 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -13,7 +13,7 @@
 import torch.backends.mps
 from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \
     skipCUDANonDefaultStreamIf, TEST_WITH_ASAN, TEST_WITH_UBSAN, TEST_WITH_TSAN, \
-    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, \
+    IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, IS_WINDOWS, TEST_WITH_MPS, \
     _TestParametrizer, compose_parametrize_fns, dtype_name, \
     NATIVE_DEVICES, skipIfTorchDynamo
 from torch.testing._internal.common_cuda import _get_torch_cuda_version, \
@@ -555,10 +555,8 @@ def get_device_type_test_bases():
         test_bases.append(CPUTestBase)
         if torch.cuda.is_available():
             test_bases.append(CUDATestBase)
-        # Disable MPS testing in generic device testing temporarily while we're
-        # ramping up support.
-        # elif torch.backends.mps.is_available():
-        #   test_bases.append(MPSTestBase)
+        elif torch.backends.mps.is_available():
+          test_bases.append(MPSTestBase)
 
     return test_bases
 
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 03193f5ed7b2..66466c56aa3a 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -896,6 +896,7 @@ def _check_module_exists(name: str) -> bool:
 TEST_WITH_TSAN = os.getenv('PYTORCH_TEST_WITH_TSAN', '0') == '1'
 TEST_WITH_UBSAN = os.getenv('PYTORCH_TEST_WITH_UBSAN', '0') == '1'
 TEST_WITH_ROCM = os.getenv('PYTORCH_TEST_WITH_ROCM', '0') == '1'
+TEST_WITH_MPS = os.getenv('PYTORCH_TEST_WITH_MPS', '0') == '1'
 
 # Enables tests that are slow to run (disabled by default)
 TEST_WITH_SLOW = os.getenv('PYTORCH_TEST_WITH_SLOW', '0') == '1'

From 12085cffc8cc30658617b147da9db143026d5591 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 24 Jan 2023 12:37:40 -0800
Subject: [PATCH 02/29] Enable MPS CI runners (#252)

* Test MPS CI runners

* Cherry pick remaining files

* Enable lintrunner:

* Change lint  runner

* Retrigger checks

* Retrigger checks #2

* Retrigger checks #3

* Retrigger checks #4

* Retrigger checks #5

* Retrigger checks #5

* Retrigger checks #7

* Retrigger checks #8

* Retrigger checks #9

* Retrigger checks #9 (change arch to arm)

* Retrigger checks #10

* Retrigger checks #11

* Retrigger checks #12

* Retrigger checks #13

* Retrigger checks #14

* Retrigger checks #14

* Retrigger checks #15

* Retrigger checks #16

* Retrigger checks #16

* Retrigger checks #17

* Retrigger checks #19

* Retrigger checks #20

* Retrigger checks #21

* Fix lintrunner

* Fix lintrunner

* Remove lint.json
---
 .github/workflows/_mac-build.yml   |   4 +-
 .github/workflows/_mac-test.yml    |   6 +
 .github/workflows/check-labels.yml |  44 ---
 .github/workflows/lint.yml         |  69 ++++-
 .github/workflows/mac-mps.yml      |  11 +-
 test/test_mps.py                   | 468 ++++++++++++++++++++++++++++-
 6 files changed, 547 insertions(+), 55 deletions(-)
 delete mode 100644 .github/workflows/check-labels.yml

diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index f5f66ae5129b..58c70125b711 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -63,8 +63,8 @@ on:
 
 jobs:
   build:
-    # Don't run on forked repos.
-    if: github.repository_owner == 'pytorch'
+    # # Don't run on forked repos.
+    # if: github.repository_owner == 'pytorch'
     runs-on: ${{ inputs.runner-type }}
     env:
       # For sccache access (only on non-forked PRs)
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index d8ede95f2958..f61a3d28a345 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -182,6 +182,12 @@ jobs:
         run: |
           cat test/**/*.log || true
 
+      - name: Print remaining test logs
+        shell: bash
+        if: always()
+        run: |
+          cat test/**/*.log || true
+
       - name: Get workflow job id
         id: get-job-id
         uses: ./.github/actions/get-workflow-job-id
diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml
deleted file mode 100644
index 5fa5fed16daf..000000000000
--- a/.github/workflows/check-labels.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Check Labels
-
-on:
-  pull_request:
-    types: [opened, synchronize, reopened, labeled, unlabeled]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  check-labels:
-    name: Check labels
-    runs-on: linux.20_04.4x
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-
-      - name: Setup Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/.github/requirements-gha-cache.txt
-
-      - name: Install requirements
-        id: requirements
-        run: |
-          pip install -r .github/requirements-gha-cache.txt --user
-
-      - name: Check labels
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          PR_NUM: ${{ github.event.number }}
-        run: |
-          set -ex
-          python3 .github/scripts/check_labels.py "${PR_NUM}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 5dc152286e50..98a941d48b83 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -5,9 +5,6 @@ on:
   push:
     branches:
       - master
-      - main
-      - release/*
-      - landchecks/*
   workflow_dispatch:
 
 # The names of steps that actually test the code should be suffixed with `(nonretryable)`.
@@ -251,6 +248,72 @@ jobs:
           # All we need to see is that it passes
           python3 torch/utils/collect_env.py
 
+    runs-on: macos-m1-12
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: 3.9
+          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
+          # pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
+
+      - name: Install requirements
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m pip install --force-reinstall -r .github/requirements-gha-cache.txt
+
+      - name: Initialize lint dependencies
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} lintrunner init
+
+      - name: Do build steps necessary for linters
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          ${CONDA_RUN} python3 -m tools.linter.clang_tidy.generate_build_files
+          ${CONDA_RUN} python3 -m tools.generate_torch_version --is_debug=false
+          ${CONDA_RUN} python3 -m tools.pyi.gen_pyi \
+            --native-functions-path aten/src/ATen/native/native_functions.yaml \
+            --tags-path aten/src/ATen/native/tags.yaml \
+            --deprecated-functions-path "tools/autograd/deprecated.yaml"
+
+      - name: Run lintrunner on all MPS files (nonretryable)
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+          PY_VERS: 3.9
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          set +e
+          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then
+              echo ""
+              echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
+              echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
+              exit 1
+          fi
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
   cancel-in-progress: true
diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml
index 663eac84514f..a2ca4867fd76 100644
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@@ -1,10 +1,11 @@
 name: Mac MPS
 
 on:
-  push:
-    tags:
-      - ciflow/mps/*
-  workflow_dispatch:
+  # push:
+  #   tags:
+  #     - ciflow/mps/*
+  # workflow_dispatch:
+  pull_request:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -18,7 +19,7 @@ jobs:
       sync-tag: macos-12-py3-arm64-build
       build-environment: macos-12-py3-arm64
       xcode-version: "13.3.1"
-      runner-type: macos-12-xl
+      runner-type: macos-m1-13
       build-generates-artifacts: true
       # To match the one pre-installed in the m1 runners
       python_version: 3.9.12
diff --git a/test/test_mps.py b/test/test_mps.py
index b3740b5cd114..2b186d8f4c19 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1212,7 +1212,11 @@ def test_norm(self):
         self.assertEqual(res, res_cpu)
 
         c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps")
+<<<<<<< HEAD
         c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu")
+=======
+        c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu")
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
         res = torch.norm(c, dim=0)
         res_cpu = torch.norm(c_cpu, dim=0)
@@ -2371,12 +2375,21 @@ def helper(x, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), False, False)
+<<<<<<< HEAD
         helper(torch.randint(3, (10, )), False, False)
         helper(torch.randint(3, (10, )), True, False)
         helper(torch.randint(3, (10, )), False, True)
         helper(torch.randint(3, (10, )), True, True)
         helper(torch.randint(3, (1, )), True, True)
         helper(torch.randint(3, (0, )), True, True)
+=======
+        helper(torch.randint(3, (10,)), False, False)
+        helper(torch.randint(3, (10,)), True, False)
+        helper(torch.randint(3, (10,)), False, True)
+        helper(torch.randint(3, (10,)), True, True)
+        helper(torch.randint(3, (1,)), True, True)
+        helper(torch.randint(3, (0,)), True, True)
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
     def test_unique_consecutive(self):
         def helper(x, dim, return_inverse, return_counts):
@@ -2388,6 +2401,7 @@ def helper(x, dim, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False)
+<<<<<<< HEAD
         helper(torch.randint(3, (10, )), 0, False, False)
         helper(torch.randint(3, (10, )), 0, True, False)
         helper(torch.randint(3, (10, )), 0, False, True)
@@ -2395,6 +2409,15 @@ def helper(x, dim, return_inverse, return_counts):
         helper(torch.randint(3, (10, )), 0, True, True)
         helper(torch.randint(3, (1, )), 0, True, True)
         helper(torch.randint(3, (0, )), 0, True, True)
+=======
+        helper(torch.randint(3, (10,)), 0, False, False)
+        helper(torch.randint(3, (10,)), 0, True, False)
+        helper(torch.randint(3, (10,)), 0, False, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (10,)), 0, True, True)
+        helper(torch.randint(3, (1,)), 0, True, True)
+        helper(torch.randint(3, (0,)), 0, True, True)
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False)
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True)
@@ -4776,6 +4799,11 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
         # check the workaround for the right padding bug in Monterey
         helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
+<<<<<<< HEAD
+=======
+        # input size < pad size
+        helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
     # Test stack forward
     def test_stack(self):
@@ -5058,6 +5086,7 @@ def helper(shape, dim=0):
             for dim in range(len(shape)):
                 helper(shape, dim)
 
+<<<<<<< HEAD
     # Test softplus
     def test_softplus(self):
         def helper(shape, beta=1, threshold=20):
@@ -5081,8 +5110,31 @@ def helper(shape, beta=1, threshold=20):
             for beta in [0.5, 1, 2, 3, 4]:
                 for threshold in [0.5, 20, 30, 40, 50]:
                     helper(shape, beta, threshold)
+=======
+    # # Test softplus
+    # def test_softplus(self):
+    #     def helper(shape, beta=1, threshold=20):
+    #         cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
+    #         x = cpu_x.detach().clone().to('mps').requires_grad_()
+
+    #         softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
+    #         softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
+
+    #         cpu_grad = torch.randn(softplus_result.shape)
+    #         grad = cpu_grad.to('mps')
+
+    #         softplus_result.backward(gradient=grad)
+    #         softplus_result_cpu.backward(gradient=cpu_grad)
+
+    #         self.assertEqual(softplus_result, softplus_result_cpu)
+    #         self.assertEqual(x.grad, cpu_x.grad)
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
-    # Test silu
+    #     # Test empty shape too
+    #     for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
+    #         for beta in [0.5, 1, 2, 3, 4]:
+    #             for threshold in [0.5, 20, 30, 40, 50]:
+    #                 helper(shape, beta, threshold)
 
     def test_silu(self):
         def helper(shape):
@@ -5776,7 +5828,11 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="
 
         # for reduce in ["sum", "prod", "amax", "amin"]:
         for reduce_type in ["add", "multiply"]:
+<<<<<<< HEAD
             helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type)
+=======
+            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce)
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
             helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
@@ -9511,6 +9567,7 @@ class TestConsistency(TestCaseMPS):
     # If the dtype list is None, all dtypes are excluded.
     # All the entries in this list should be removed
     BLOCKLIST = {
+<<<<<<< HEAD
         # Functions that hang
         'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
         # + forward when requires_grad=True or running backward
@@ -9621,6 +9678,393 @@ class TestConsistency(TestCaseMPS):
         'inner': None,
         'dstack': None,
         'take_along_dim': None,
+=======
+        # Functions that hard crash
+        'nn.functional.softplus': [torch.float32],
+        'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
+        'sgn': [torch.bool],
+        'linalg.inv': [torch.float32],
+        'linalg.inv_ex': [torch.float32],
+        'linalg.matrix_power': [torch.float32],
+        'nn.functional.interpolate': [torch.float32],
+        'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.interpolatearea': [torch.float32],
+        'resize_as_': [torch.float16, torch.float32],
+        'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
+
+        # Functions with correctness issues
+        'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
+        'divtrunc_rounding': [torch.float16],
+        'norm': [torch.float16],
+        'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
+        'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addr': [torch.float16],
+        'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'trace': [torch.int64],
+        'normalnumber_mean': [torch.float16, torch.float32],
+        'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'multinomial': [torch.float32],
+        'floor_divide': [torch.int16, torch.int32, torch.int64],
+        'dist': [torch.float16],
+
+        # failure due to issue: atan2() may generate NAN in output with
+        'atan2': [torch.bool, torch.int16, torch.int32, torch.uint8],
+
+        # Unsupported Border padding mode
+        'grid_sampler_2d': [torch.float32],
+        'nn.functional.grid_sample': [torch.float32],
+
+        # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor()
+        # when both ceilMode and includeZeroPadToAverage are True
+        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
+        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
+        'nn.functional.adaptive_avg_pool1d': [torch.float32],
+        'nn.functional.adaptive_avg_pool2d': [torch.float32],
+    }
+
+    UNIMPLEMENTED_OPS = {
+        # Failures due to lack of op implementation on MPS backend
+        'linalg.eig': [torch.float32],
+        'linalg.eigvals': [torch.float32],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'stft': [torch.float32],
+        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
+        'rounddecimals_neg_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_3': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rounddecimals_0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmod__': [torch.float16, torch.float32],
+        '__rsub__': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'aminmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'angle': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'argsort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bucketize': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cholesky': [torch.float32],
+        'cholesky_inverse': [torch.float32],
+        'cholesky_solve': [torch.float32],
+        'copysign': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cummin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cumprod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'digamma': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfc': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'erfinv': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmax': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmin': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fmod': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'frexp': [torch.float16, torch.float32],
+        'gcd': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'geqrf': [torch.float32],
+        'heaviside': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'histc': [torch.float32],
+        'histogram': [torch.float32],
+        'histogramdd': [torch.float32],
+        'hypot': [torch.float32],
+        'i0': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'igamma': [torch.float16, torch.float32],
+        'igammac': [torch.float16, torch.float32],
+        'index_copy': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_fill': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'index_reduce': [torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isin': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isneginf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'isposinf': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'kthvalue': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lcm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ldexp': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lerp': [torch.float32],
+        'lgamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.cholesky': [torch.float32],
+        'linalg.cholesky_ex': [torch.float32],
+        'linalg.cond': [torch.float32],
+        'linalg.detsingular': [torch.float32],
+        'linalg.det': [torch.float32],
+        'linalg.eigh': [torch.float32],
+        'linalg.eigvalsh': [torch.float32],
+        'linalg.householder_product': [torch.float32],
+        'linalg.ldl_factor': [torch.float32],
+        'linalg.ldl_factor_ex': [torch.float32],
+        'linalg.ldl_solve': [torch.float32],
+        'linalg.lstsq': [torch.float32],
+        'linalg.lstsqgrad_oriented': [torch.float32],
+        'linalg.lu': [torch.float32],
+        'linalg.lu_factor': [torch.float32],
+        'linalg.lu_factor_ex': [torch.float32],
+        'linalg.lu_solve': [torch.float32],
+        'linalg.matrix_norm': [torch.float32],
+        'linalg.norm': [torch.float32],
+        'linalg.normsubgradients_at_zero': [torch.float32],
+        'linalg.qr': [torch.float32],
+        'linalg.slogdet': [torch.float32],
+        'linalg.solve': [torch.float32],
+        'linalg.solve_ex': [torch.float32],
+        'linalg.svdvals': [torch.float32],
+        'linalg.tensorsolve': [torch.float32],
+        'linalg.vander': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.vecdot': [torch.float32],
+        'logcumsumexp': [torch.float32],
+        'logdet': [torch.float32],
+        'logit': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'lu': [torch.float32],
+        'lu_solve': [torch.float32],
+        'lu_unpack': [torch.float32],
+        'masked.cumprod': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'masked.median': [torch.float32],
+        'masked_scatter': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matrix_exp': [torch.float32],
+        'mode': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'msort': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgamma': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_1': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_3': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mvlgammamvlgamma_p_5': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nanquantile': [torch.float32],
+        'nanmean': [torch.float32, torch.float16],
+        'nanmedian': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nansum': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'native_dropout_backward': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nextafter': [torch.float32],
+        'normnuc': [torch.float32],
+        'nn.functional._scaled_dot_product_attention': [torch.float32],
+        'nn.functional.fractional_max_pool2d': [torch.float32],
+        'nn.functional.fractional_max_pool3d': [torch.float32],
+        'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32],
+        'nn.functional.adaptive_max_pool3d': [torch.float32],
+        'nn.functional.interpolatebicubic': [torch.float32],
+        'nn.functional.interpolatelinear': [torch.float32],
+        'nn.functional.interpolatetrilinear': [torch.float32],
+        'nn.functional.max_unpool1dgrad': [torch.float32],
+        'nn.functional.max_unpool2dgrad': [torch.float32],
+        'nn.functional.max_unpool3dgrad': [torch.float32],
+        'nn.functional.avg_pool3d': [torch.float32, torch.int64],
+        'nn.functional.ctc_loss': [torch.float32],
+        'nn.functional.embedding_bag': [torch.float16, torch.float32],
+        'nn.functional.max_pool2d': [torch.float32],
+        'nn.functional.hardshrink': [torch.float32],
+        'nn.functional.hardsigmoid': [torch.float32],
+        'nn.functional.logsigmoid': [torch.float32],
+        'nn.functional.max_pool3d': [torch.float32],
+        'nn.functional.max_unpool1d': [torch.float32],
+        'nn.functional.max_unpool2d': [torch.float32],
+        'nn.functional.max_unpool3d': [torch.float32],
+        'nn.functional.mish': [torch.float32],
+        'nn.functional.multi_margin_loss': [torch.float32],
+        'nn.functional.multilabel_margin_loss': [torch.float32],
+        'nn.functional.multilabel_soft_margin_loss': [torch.float32],
+        'nn.functional.pdist': [torch.float32],
+        'nn.functional.rrelu': [torch.float32],
+        'nn.functional.softshrink': [torch.float32],
+        'nn.functional.unfold': [torch.float16, torch.float32],
+        'nn.functional.norm': [torch.float32],
+        'ormqr': [torch.float32],
+        'pca_lowrank': [torch.float32],
+        'pinverse': [torch.float32],
+        'polar': [torch.float32],
+        'polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_3': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'qr': [torch.float32],
+        'quantile': [torch.float32],
+        'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'renorm': [torch.float16, torch.float32],
+        'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceamin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemin': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducemean': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reduceprod': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'scatter_reducesum': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'searchsorted': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduce': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reduceoffsets': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'segment_reducelengths': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sinc': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'sort': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.airy_ai': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_j1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y0': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.bessel_y1': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.chebyshev_polynomial_t': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
+        'special.chebyshev_polynomial_u': [torch.bool,
+                                           torch.float16,
+                                           torch.float32,
+                                           torch.int16,
+                                           torch.int32,
+                                           torch.int64,
+                                           torch.uint8],
+        'special.entr': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.erfcx': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.hermite_polynomial_h': [torch.bool,
+                                         torch.float16,
+                                         torch.float32,
+                                         torch.int16,
+                                         torch.int32,
+                                         torch.int64,
+                                         torch.uint8],
+        'special.hermite_polynomial_he': [torch.bool,
+                                          torch.float16,
+                                          torch.float32,
+                                          torch.int16,
+                                          torch.int32,
+                                          torch.int64,
+                                          torch.uint8],
+        'special.i0e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.i1e': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.laguerre_polynomial_l': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.log_ndtr': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_i1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.ndtri': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygamma': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.polygammaspecial_polygamma_n_0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.scaled_modified_bessel_k1': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.spherical_bessel_j0': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.xlog1py': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'special.zeta': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'std_mean': [torch.float16, torch.float32],
+        'std_meanunbiased': [torch.float16, torch.float32],
+        'svd_lowrank': [torch.float32],
+        'symeig': [torch.float32],
+        'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'var_mean': [torch.float16, torch.float32],
+        'var_meanunbiased': [torch.float16, torch.float32],
+        'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'view_as_complex': [torch.float16, torch.float32],
+        'xlogy': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+    }
+
+    EXPECTED_FAILURES = {
+        # Failures due to unsupported data types on MPS backend
+        'bfloat16': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'chalf': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.conv1d': [torch.int64],
+        'nn.functional.conv2d': [torch.int64],
+        'nn.functional.conv_transpose1d': [torch.int64],
+        'nn.functional.softminwith_dtype': [torch.bool,
+                                            torch.float16,
+                                            torch.float32,
+                                            torch.int16,
+                                            torch.int32,
+                                            torch.int64,
+                                            torch.uint8],
+        'log_softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'softmaxwith_dtype': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rmatmul__': [torch.int16, torch.int32, torch.uint8],
+        'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cdouble': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'cfloat': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'complex': [torch.float16, torch.float32],
+        'double': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'einsum': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.fftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.hfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ifftshift': [torch.bool, torch.float32, torch.float16, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.ihfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfft2': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.irfftn': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'fft.rfft': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'float_power': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'full_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'inner': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.matrix_rank': [torch.float32],
+        'linalg.matrix_rankhermitian': [torch.float32],
+        'linalg.multi_dot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'linalg.pinv': [torch.float32],
+        'linalg.pinvhermitian': [torch.float32],
+        'log_softmax': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'matmul': [torch.int16, torch.int32, torch.int64, torch.uint8],  # MPS device does not support mm for non-float inputs
+        'mm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'mv': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_full': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_ones': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_zeros': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.batch_norm': [torch.float32],
+        'nn.functional.bilinear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.linear': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'nn.functional.softmin': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'ones_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'signal.windows.blackman': [torch.float16],
+        'signal.windows.cosine': [torch.float16],
+        'signal.windows.exponential': [torch.float16],
+        'signal.windows.gaussian': [torch.float16],
+        'signal.windows.general_cosine': [torch.float16],
+        'signal.windows.general_hamming': [torch.float16],
+        'signal.windows.hamming': [torch.float16],
+        'signal.windows.hann': [torch.float16],
+        'signal.windows.kaiser': [torch.float16],
+        'stft': [torch.float32],
+        'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8],
+
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.int16, torch.int32],
+    }
+
+    UNDEFINED_BEHAVIOUR = {
+        # Failures due to random output that they generate using
+        # Philox engine causing mismatch with CPU results
+        'uniform': [torch.float16, torch.float32],
+        'rand_like': [torch.float16, torch.float32],
+        'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'randn_like': [torch.float16, torch.float32],
+        'bernoulli': [torch.float32],
+        'normal': [torch.float16, torch.float32, torch.float16, torch.float32],
+        'nn.functional.alpha_dropout': [torch.float32],
+        'nn.functional.dropout': [torch.float32],
+        'nn.functional.dropout2d': [torch.float32],
+        'nn.functional.dropout3d': [torch.float32],
+        # these fill tensors with uninitialized data, causing mismatch with CPU
+        'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
+        'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # duplicate indices are used in the testcase - undefined behaviour
+        'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
     }
 
     # Those ops worked on MacOS12, but broken on MacOS13
@@ -9642,6 +10086,20 @@ class TestConsistency(TestCaseMPS):
         'masked.var',
     }
 
+<<<<<<< HEAD
+=======
+    dirname = os.path.dirname(__file__)
+    filename = os.path.join(dirname, "cuda_results.yaml")
+    with open(filename) as f:
+        data = yaml.safe_load(f)
+    CUDA_RESULT = dict()
+    for key, value in data.items():
+        CUDA_RESULT[key] = torch.as_tensor(value)
+
+    MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), (
+        FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
+
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
@@ -9728,8 +10186,16 @@ def get_samples():
                 self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
 
             except Exception as e:
+<<<<<<< HEAD
                 if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]):
                     self.skipTest(f"Expected Runtime Error: {str(e)}")
+=======
+                if any(s in str(e).lower() for s in ["int64", "macos 13"]):
+                    self.skipTest(f"{str(e)}")
+
+                if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
+                    continue
+>>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
                 if not generate_new_truth:
                     raise e

From e8f89dfbb6c91ccad98e8eb2450fd079bafe28a2 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 4 Feb 2023 11:44:46 -0800
Subject: [PATCH 03/29] Use DISTRIBUTED=1 for MPS CI runners (#292)

* Use DISTRIBUTED=1 for MPS CI runners

* Disable openmp
---
 .ci/pytorch/macos-build.sh       | 2 +-
 .github/workflows/_mac-build.yml | 1 +
 .github/workflows/_mac-test.yml  | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.ci/pytorch/macos-build.sh b/.ci/pytorch/macos-build.sh
index dbba68081d3e..0b0b1e3599b3 100755
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@@ -37,7 +37,7 @@ cross_compile_arm64() {
   # Cross compilation for arm64
   # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
   # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+  USE_DISTRIBUTED=1 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_OPENMP=OFF USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 }
 
 compile_x86_64() {
diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml
index 58c70125b711..5a6483ad54b3 100644
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@@ -106,6 +106,7 @@ jobs:
           environment-file: ${{ inputs.environment-file }}
 
       - name: Install macOS homebrew dependencies
+        if: ${{ runner.arch == 'X64' }}
         run: |
           # Install dependencies
           brew install libomp
diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml
index f61a3d28a345..fb4ceaad40be 100644
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@@ -128,6 +128,7 @@ jobs:
           echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
 
       - name: Install macOS homebrew dependencies
+        if: ${{ runner.arch == 'X64' }}
         run: |
           # Install dependencies
           brew install libomp

From 85cdb98935efe7029dbc572d7211fe8a9d6cf417 Mon Sep 17 00:00:00 2001
From: Kulin Seth <kulin_seth@apple.com>
Date: Tue, 14 Feb 2023 08:21:06 -0800
Subject: [PATCH 04/29] Update the test mps.

---
 test/cuda_results.yaml |  102 ++
 test/test_mps.py       | 2157 ++++++++++++++++++++++++----------------
 2 files changed, 1391 insertions(+), 868 deletions(-)
 create mode 100644 test/cuda_results.yaml

diff --git a/test/cuda_results.yaml b/test/cuda_results.yaml
new file mode 100644
index 000000000000..bc6e0948ae56
--- /dev/null
+++ b/test/cuda_results.yaml
@@ -0,0 +1,102 @@
+ConsistencyTest: {
+  nn.functional.conv_transpose2d:
+    [[[7.399066925048828, 4.4053635597229, -25.85348129272461,
+        58.88909149169922, -88.75193786621094, -18.98126983642578, 9.437820434570312],
+      [-59.78305435180664, -65.34088134765625, -108.04747009277344, 196.6062469482422,
+        71.39350891113281, 37.8786735534668, -69.55322265625], [92.78504943847656,
+        91.24403381347656, -94.33301544189453, 9.261059761047363, -182.10206604003906,
+        141.4270477294922, 146.89010620117188], [-14.363212585449219, 43.454036712646484,
+        -76.1098403930664, 242.9479522705078, 198.1458282470703, -49.77315139770508,
+        5.891449451446533], [-43.56822967529297, 4.782844066619873, -29.526945114135742,
+        65.15388488769531, 161.29757690429688, 118.60847473144531, 27.08570671081543],
+      [68.29853057861328, -11.507468223571777, 2.044086217880249, 11.003862380981445,
+        34.993282318115234, -21.256723403930664, 91.49512481689453], [-70.4466781616211,
+        69.04386138916016, 7.764842987060547, 7.61972713470459, -28.99899673461914,
+        54.575748443603516, -5.762258052825928]], [[-36.238487243652344, 37.29551696777344,
+        -22.012331008911133, -30.1353702545166, 33.82851028442383, 33.00322341918945,
+        2.7218000888824463], [-7.999058246612549, 122.72489929199219, -1.0639530420303345,
+        2.9564287662506104, -143.1276092529297, -110.75650024414062, 48.0764274597168],
+      [-91.0599136352539, -11.656601905822754, 69.62447357177734, 88.12522888183594,
+        337.3008728027344, -76.9416732788086, -110.24406433105469], [-108.1512451171875,
+        98.42401123046875, 142.46144104003906, -127.48089599609375, -3.367496967315674,
+        86.82833099365234, 86.29623413085938], [-14.339198112487793, -52.287410736083984,
+        171.43614196777344, 200.14817810058594, 200.35476684570312, -189.4150390625,
+        -46.86980056762695], [30.196495056152344, 25.22877311706543, 95.29426574707031,
+        4.455311298370361, 118.48747253417969, 87.11080932617188, -83.6124038696289],
+      [-2.5434072017669678, 91.8791732788086, -10.615175247192383, -12.58531379699707,
+        -49.3439826965332, 33.37324523925781, -5.983145713806152]], [[4.551003932952881,
+        15.84842586517334, -46.354671478271484, 14.721636772155762, 39.01048278808594,
+        49.70054244995117, -18.268564224243164], [16.728954315185547, 129.43505859375,
+        -4.6139116287231445, -3.382319688796997, -238.76353454589844, 13.42194938659668,
+        40.393280029296875], [-2.335604429244995, -85.94283294677734, -142.2253875732422,
+        135.27537536621094, 18.01512336730957, -26.331714630126953, -33.35443878173828],
+      [-79.17593383789062, -93.72674560546875, -110.94194030761719, -61.455223083496094,
+        6.811624526977539, 129.06478881835938, 12.435402870178223], [10.859378814697266,
+        41.3059196472168, 143.55824279785156, -41.754737854003906, -235.32406616210938,
+        -70.98460388183594, 130.46929931640625], [193.57574462890625, -142.5060272216797,
+        -102.45012664794922, 124.68048095703125, 136.05215454101562, -9.650590896606445,
+        -45.59521484375], [-37.829593658447266, 39.12519454956055, 9.293094635009766,
+        -18.8004093170166, -0.7294210195541382, 51.884910583496094, 36.15913391113281]],
+    [[-15.651233673095703, 16.31340980529785, -26.752052307128906, 6.281721115112305,
+        43.765541076660156, -13.097319602966309, -30.443206787109375], [10.67841911315918,
+        66.1829605102539, -9.394262313842773, -131.45101928710938, -38.621002197265625,
+        65.9507064819336, 48.76960372924805], [-76.0918197631836, -9.108996391296387,
+        13.64936637878418, 96.7411880493164, 124.2474365234375, -111.50318145751953,
+        -42.397071838378906], [-83.31562805175781, 32.27967071533203, 250.08163452148438,
+        58.24131393432617, 129.95318603515625, -10.683560371398926, -123.84668731689453],
+      [-11.536887168884277, -15.220125198364258, 197.18821716308594, -31.680112838745117,
+        -81.35874938964844, 157.96974182128906, 105.61251831054688], [78.15926361083984,
+        -84.49744415283203, -73.91180419921875, 86.370361328125, 77.87918090820312,
+        55.3555908203125, -7.273794651031494], [25.232547760009766, 30.352109909057617,
+        53.722267150878906, 44.87421798706055, 44.618812561035156, 4.511796951293945,
+        9.039834976196289]]]
+}
+UnitTest: {
+  norm: 
+  [
+    {
+      dtype: f16,
+      args: [[[ 8.9453,  4.0859,  0.1230,  2.1367, -5.0000],
+        [ 7.2773, -4.6953, -3.5586,  8.2812, -0.8789],
+        [ 0.7119, -1.4854,  6.8633, -7.9805, -3.6562],
+        [-1.0195, -7.2695, -0.0264, -3.5078, -0.2900],
+        [ 8.7656,  5.8984, -2.3125, -0.0352,  5.2812]],],
+      params: [0.5,],
+      res: [2000.]
+    },
+    {
+      dtype: f16,
+      args: [[[[ 8.9219,  3.0508, -3.0234, -5.6250, -5.3516],
+         [-5.8906,  5.2109, -7.2500,  7.3047, -0.1846],
+         [-2.1367, -8.8047, -3.4727, -3.0859,  4.9062],
+         [ 2.1797, -8.5078,  6.1445, -5.0547,  2.8828],
+         [-2.6191,  4.6680, -4.1758,  8.7734, -5.4844]],
+
+        [[-5.8984,  7.3281, -7.3672, -0.0879,  7.0039],
+         [ 2.0117, -6.4258,  8.6250,  2.5137, -2.2676],
+         [-7.2578,  1.6875,  7.8750,  7.5078,  0.8350],
+         [-4.8164, -3.6914, -3.9199,  4.9219, -4.6680],
+         [ 5.0547, -7.1289,  2.3633,  3.7793, -7.4375]],
+
+        [[-8.6953, -3.8750,  0.8965, -4.4453,  6.1328],
+         [ 8.6719,  2.5586, -3.0664, -7.7891,  2.5234],
+         [ 5.8008,  0.5977,  4.9219,  3.0156,  3.6211],
+         [-6.0898, -3.4883,  2.6543,  7.1992,  5.9414],
+         [-3.6035,  8.3906,  2.2070, -1.1162,  7.2852]],
+
+        [[-2.4531, -2.9180,  6.2422, -6.3711, -8.3516],
+         [ 3.3398, -8.5078, -8.9375, -2.0312, -4.3320],
+         [-1.4326, -4.5000, -0.3252, -6.8555, -8.2969],
+         [ 5.8438,  5.6094, -6.6797, -0.0439,  3.6035],
+         [ 4.5859,  7.1016, -0.8086,  5.6953,  0.5098]],
+
+        [[ 3.0859,  4.4844,  0.6152,  7.9609, -7.6562],
+         [-0.7998, -3.4023,  5.7734, -2.4785,  5.9219],
+         [ 7.1094,  1.4502, -7.1289,  4.7188, -4.8359],
+         [ 2.7422, -1.9512,  5.6602, -3.6387, -8.6953],
+         [-4.6953,  0.2900,  2.7148, -0.0176,  7.6992]]],],
+      params: [1.5],
+      res: [125.2500]
+    },
+  ],
+}
diff --git a/test/test_mps.py b/test/test_mps.py
index 2b186d8f4c19..2085d0cebe72 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -16,8 +16,10 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import itertools
+import yaml
+import platform
 from collections import defaultdict
-from torch import inf
+from torch._six import inf
 from torch.nn import Parameter
 from torch.testing._internal import opinfo
 from torch.testing._internal.common_utils import \
@@ -26,9 +28,10 @@
 from torch.testing import make_tensor
 from torch.testing._comparison import TensorLikePair
 from torch.testing._internal.common_dtype import get_all_dtypes, integral_types
+import torch.mps
 import torch.backends.mps
 from torch.distributions import Uniform, Exponential
-from functools import partial
+from functools import partial, reduce
 
 from torch.testing._internal.common_methods_invocations import (
     op_db,
@@ -62,6 +65,8 @@
     TestCase = object  # noqa: F811
     NNTestCase = object  # noqa: F811
 
+product_version = float('.'.join(platform.mac_ver()[0].split('.')[:2]))
+
 # Determine whether to enable MPS memory leak check (uses same code as CUDA).
 TEST_MPS_MEM_LEAK_CHECK = os.getenv('PYTORCH_TEST_MPS_MEM_LEAK_CHECK', '0') == '1'
 
@@ -371,6 +376,15 @@ def test_avg_pool2d_ceil_mode(self):
 
 
 class TestMPS(TestCaseMPS):
+    def help_extra_unit(self, opname, op):
+        if opname not in OP_UNIT_TEST:
+            return
+        for test in OP_UNIT_TEST[opname]:
+            mps_args = test.sample()
+            mps_out = op(*mps_args)
+            mps_out = (mps_out, ) if isinstance(mps_out, torch.Tensor) else mps_out
+            self.assertEqual(test.expected(), mps_out)
+
     def test_exp(self, device="mps", dtype=torch.float):
         for v in (2, -2) + ((1j, 1 + 1j) if dtype.is_complex else ()):
             b = torch.arange(18, device="cpu") / 3 * math.pi
@@ -432,6 +446,53 @@ def helper(val, shape):
         helper(0, [1024])
         helper(0.2, [2, 3])
 
+    def test_mm(self):
+        B = torch.ones(5, 6).to("mps")
+        C = torch.ones(6, 5).to("mps")
+        D = torch.mm(B, C).cpu()
+        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
+
+    def test_linalg_cross(self):
+        def helper(dtype):
+            device = "mps"
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
+            else:
+                x = torch.rand(100, 3, 100, dtype=dtype, device=device)
+                y = torch.rand(100, 3, 100, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+
+            # test for broadcastable inputs
+            if dtype is torch.int32 or dtype is torch.int64:
+                x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device)
+                y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device)
+            else:
+                x = torch.rand(1, 3, 2, dtype=dtype, device=device)
+                y = torch.rand(4, 3, 1, dtype=dtype, device=device)
+            x_cpu = x.to("cpu")
+            y_cpu = y.to("cpu")
+            res1 = torch.linalg.cross(x, y, dim=1)
+            res2 = torch.tensor((), dtype=dtype, device=device)
+            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
+            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
+            torch.linalg.cross(x, y, dim=1, out=res2)
+            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
+            self.assertEqual(res1, res2)
+            self.assertEqual(res1, res1_cpu)
+            self.assertEqual(res2, res2_cpu)
+        [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
+
     def test_cdist_large(self, device="mps"):
         for cm in ['use_mm_for_euclid_dist_if_necessary', 'use_mm_for_euclid_dist', 'donot_use_mm_for_euclid_dist']:
             x = torch.randn(100, 10, device=device)
@@ -577,53 +638,6 @@ def test_cdist_norm_batch(self, device="mps"):
                             expected = self._brute_cdist(x, y, p=p)
                             self.assertEqual(expected, actual)
 
-    def test_mm(self):
-        B = torch.ones(5, 6).to("mps")
-        C = torch.ones(6, 5).to("mps")
-        D = torch.mm(B, C).cpu()
-        torch.testing.assert_close(D, torch.full((5, 5), 6.0))
-
-    def test_linalg_cross(self):
-        def helper(dtype):
-            device = "mps"
-            if dtype is torch.int32 or dtype is torch.int64:
-                x = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
-                y = torch.randint(0, 99999, (100, 3, 100), dtype=dtype, device=device)
-            else:
-                x = torch.rand(100, 3, 100, dtype=dtype, device=device)
-                y = torch.rand(100, 3, 100, dtype=dtype, device=device)
-            x_cpu = x.to("cpu")
-            y_cpu = y.to("cpu")
-            res1 = torch.linalg.cross(x, y, dim=1)
-            res2 = torch.tensor((), dtype=dtype, device=device)
-            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
-            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
-            torch.linalg.cross(x, y, dim=1, out=res2)
-            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
-            self.assertEqual(res1, res2)
-            self.assertEqual(res1, res1_cpu)
-            self.assertEqual(res2, res2_cpu)
-
-            # test for broadcastable inputs
-            if dtype is torch.int32 or dtype is torch.int64:
-                x = torch.randint(0, 99999, (1, 3, 2), dtype=dtype, device=device)
-                y = torch.randint(0, 99999, (4, 3, 1), dtype=dtype, device=device)
-            else:
-                x = torch.rand(1, 3, 2, dtype=dtype, device=device)
-                y = torch.rand(4, 3, 1, dtype=dtype, device=device)
-            x_cpu = x.to("cpu")
-            y_cpu = y.to("cpu")
-            res1 = torch.linalg.cross(x, y, dim=1)
-            res2 = torch.tensor((), dtype=dtype, device=device)
-            res1_cpu = torch.linalg.cross(x_cpu, y_cpu, dim=1)
-            res2_cpu = torch.tensor((), dtype=dtype, device="cpu")
-            torch.linalg.cross(x, y, dim=1, out=res2)
-            torch.linalg.cross(x_cpu, y_cpu, dim=1, out=res2_cpu)
-            self.assertEqual(res1, res2)
-            self.assertEqual(res1, res1_cpu)
-            self.assertEqual(res2, res2_cpu)
-        [helper(dtype) for dtype in [torch.int32, torch.int64, torch.float32]]
-
     def test_cross(self):
         a = torch.randn(4, 3, device="mps")
         b = torch.randn(4, 3, device="mps")
@@ -640,6 +654,13 @@ def test_addmm(self):
         D = torch.addmm(A, B, C).to("cpu")
         torch.testing.assert_close(D, torch.full((5, 5), 7.0))
 
+    def test_addr(self):
+        A = torch.ones(5, 10).to("mps")
+        B = torch.ones(5).to("mps")
+        C = torch.ones(10).to("mps")
+        D = torch.addr(A, B, C).to("cpu")
+        torch.testing.assert_close(D, torch.full((5, 10), 2.0))
+
     def test_bmm(self):
         batch1_cpu = torch.randn(10, 3, 4)
         batch2_cpu = torch.randn(10, 4, 5)
@@ -653,13 +674,6 @@ def test_bmm(self):
         self.assertEqual(output_cpu, output_mps)
         self.assertEqual(output_cpu.size(), output_mps.size())
 
-    def test_addr(self):
-        A = torch.ones(5, 10).to("mps")
-        B = torch.ones(5).to("mps")
-        C = torch.ones(10).to("mps")
-        D = torch.addr(A, B, C).to("cpu")
-        torch.testing.assert_close(D, torch.full((5, 10), 2.0))
-
     def test_trace(self):
         M_cpu = torch.randn(3, 3)
         M_mps = M_cpu.detach().clone().to("mps")
@@ -1212,11 +1226,7 @@ def test_norm(self):
         self.assertEqual(res, res_cpu)
 
         c = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="mps")
-<<<<<<< HEAD
-        c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]] , dtype=torch.float, device="cpu")
-=======
         c_cpu = torch.tensor([[1, 2, 3], [-1, 1, 4]], dtype=torch.float, device="cpu")
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
         res = torch.norm(c, dim=0)
         res_cpu = torch.norm(c_cpu, dim=0)
@@ -1241,6 +1251,8 @@ def test_norm(self):
         res_cpu = torch.norm(d_cpu[0, :, :]), torch.norm(d_cpu[1, :, :])
         self.assertEqual(res, res_cpu)
 
+        self.help_extra_unit('norm', torch.norm)
+
     def test_layer_norm(self):
         # TODO: Test non-contiguous
         def helper(input_shape, normalized_shape, eps=1e-05, elementwise_affine=True, dtype=torch.float32):
@@ -1822,12 +1834,6 @@ def test_view_slice(self):
                 actual_pts[i, j] = X[pts[i, j], j]
                 self.assertEqual(actual_pts[i, j], actual_pts_mps[i, j])
 
-    def test_slice_scatter(self):
-        shape = (4, 4)
-        tensor = torch.randint(10, shape, device="mps")
-        tensor_before = tensor.clone()
-        torch.empty(shape[0], shape[1] * 2, device="mps")[:, ::2].copy_(tensor)
-        torch.testing.assert_close(tensor, tensor_before)
 
     def test_slice(self):
         values = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
@@ -1987,99 +1993,6 @@ def helper(shape, repeats):
         helper((3, 4, 5), (2, 3, 4, 5))
         helper((3, 4, 5), (2, 2, 2))
 
-    def test_torch_repeat_interleave(self, device="mps"):
-        y = torch.tensor([[1, 2], [3, 4]], device=device)
-        # exercise single argument function signature
-        temp = y.repeat_interleave(2)
-        self.assertEqual(torch.Size([8]), temp.size())
-
-        for dtype in [torch.int, torch.long]:
-            lengths = torch.tensor([1, 2], dtype=dtype, device="mps")
-            output_size = torch.sum(lengths)
-            a = torch.repeat_interleave(
-                y,
-                lengths,
-                dim=0,
-            )
-            self.assertEqual(a.dtype, y.dtype)
-            self.assertEqual(a.size(), torch.Size([3, 2]))
-
-            a_with_output = torch.repeat_interleave(
-                y,
-                lengths,
-                dim=0,
-                output_size=output_size,
-            )
-            self.assertEqual(a_with_output.dtype, y.dtype)
-            self.assertEqual(a_with_output.size(), torch.Size([3, 2]))
-
-    def test_repeat_interleave(self, device="mps"):
-        x = torch.tensor([0, 1, 2, 3], device=device)
-        expected = torch.tensor([1, 2, 2, 3, 3, 3], dtype=torch.int32, device=device)
-        self.assertEqual(torch.repeat_interleave(x), expected)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.arange(4, device=device).reshape(2, 2))
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.arange(4.0, device=device))
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(torch.tensor([1, 2, -1, 3, 4], device=device))
-
-        y = torch.tensor([[1, 2], [3, 4]], device=device)
-
-        y1_v1 = torch.repeat_interleave(y, 2)
-        y1_v2 = torch.repeat_interleave(y, torch.tensor(2, device=device))
-        y1_v3 = torch.repeat_interleave(y, torch.tensor([2], device=device))
-        y1_expect = torch.tensor([1, 1, 2, 2, 3, 3, 4, 4], device=device)
-        self.assertEqual(y1_v1, y1_expect)
-        self.assertEqual(y1_v2, y1_expect)
-        self.assertEqual(y1_v3, y1_expect)
-
-        y2 = torch.repeat_interleave(y, 3, dim=1)
-        y2_expect = torch.tensor([[1, 1, 1, 2, 2, 2],
-                                  [3, 3, 3, 4, 4, 4]], device=device)
-        self.assertEqual(y2, y2_expect)
-
-        y3 = torch.repeat_interleave(y, torch.tensor([1, 2], device=device), dim=0)
-        y3_expect = torch.tensor([[1, 2],
-                                  [3, 4],
-                                  [3, 4]], device=device)
-        self.assertEqual(y3, y3_expect)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(y, torch.tensor([1, 2, 3], device=device), dim=0)
-
-        with self.assertRaises(RuntimeError):
-            torch.repeat_interleave(y, torch.arange(9, device=device).reshape(3, 3), dim=0)
-
-        # test zero sized dimension
-        x = torch.zeros((5, 0), device=device)
-        y = torch.repeat_interleave(x, repeats=3, dim=1)
-        self.assertEqual(y, x.new_zeros(5, 0, device=device))
-
-        x = torch.tensor([], dtype=torch.int64, device=device)
-        y = torch.repeat_interleave(x, x)
-        self.assertEqual(y, x)
-
-    def test_repeat_interleave_simple(self):
-        def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None):
-            x = torch.randn(shape, dtype=dtype, device="mps")
-            x_cpu = x.detach().clone().cpu()
-
-            num_repeats_cpu = num_repeats.detach().clone().cpu()
-
-            repeats = torch.repeat_interleave(x, num_repeats, dim)
-            repeats_cpu = torch.repeat_interleave(x_cpu, num_repeats_cpu, dim)
-
-            self.assertEqual(repeats, repeats_cpu)
-        helper(shape=3, num_repeats=torch.tensor([100], device="mps"))
-        helper(shape=(2, 2), num_repeats=torch.tensor([3, 3], device="mps"), dim=0)
-        helper(shape=(10, 15, 8), num_repeats=torch.arange(10, device="mps"), dim=0)
-        helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1)
-        helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2)
-
     def test_count_nonzero(self):
         def helper(dtype):
             n = [
@@ -2155,6 +2068,15 @@ def test_to(self):
             x_mps = x_cpu.to('mps')
             self.assertEqual(x_mps.to(torch.float32), x_cpu.to(torch.float32))
 
+    @unittest.skipIf(True, "non-contiguous tensor to mps is incorrect.")
+    def test_to_non_contiguous(self):
+        x = torch.arange(16, dtype=torch.float32).reshape(2, 2, 2, 2)
+        x1 = x[:, :, :1, :]
+        x2 = x[:, :, 1:, :]
+        self.assertFalse(x1.is_contiguous())
+        self.assertFalse(x2.is_contiguous())
+        self.assertEqual(x1, x1.detach().to("mps"))
+        self.assertEqual(x2, x2.detach().to("mps"))
 
     def test_setitem_scalar(self) -> None:
         device = 'mps'
@@ -2228,9 +2150,9 @@ def test_storage_offset_greater_than_src_nbytes(self):
             tensor_list.append(t)
 
         for i in range(0, n_tensors - 1):
-            t = tensor_list[i].view(1, n_tensor_elems)
+            t = tensor_list[i].view(1, 784)
             t_mps = t.to("mps")
-            self.assertEqual(t, t_mps.cpu(), f"i={i}")
+            self.assertEqual(t, t_mps.cpu())
 
     # See https://github.com/pytorch/pytorch/issues/82427
     # and https://github.com/pytorch/pytorch/issues/83692
@@ -2242,6 +2164,7 @@ def test_full_bugs(self):
         y_cpu = torch.full((2, 2), 247, device='cpu', dtype=torch.uint8)
         self.assertEqual(y_mps, y_cpu)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     # See https://github.com/pytorch/pytorch/issues/84995
     def test_div_bugs(self):
         for (dtype, mode) in itertools.product(integral_types(), ['trunc', 'floor']):
@@ -2308,6 +2231,7 @@ def ensure_tuple(x):
                 self.assertEqual(expected_inverse.view(additional_shape), y_inverse)
                 self.assertEqual(expected_counts, y_counts)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_unique_all_dtypes(self, device="mps"):
         def helper(dtype):
             def ensure_tuple(x):
@@ -2363,7 +2287,7 @@ def ensure_tuple(x):
                                 if k == i:
                                     count += 1
                             self.assertEqual(j, count)
-        [helper(dtype) for dtype in [torch.float32, torch.int64, torch.int32, torch.int16, torch.uint8]]
+        [helper(dtype) for dtype in [torch.float32, torch.float16, torch.int64, torch.int32, torch.int16, torch.uint8]]
 
     def test_unique(self):
         def helper(x, return_inverse, return_counts):
@@ -2375,21 +2299,12 @@ def helper(x, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), False, False)
-<<<<<<< HEAD
-        helper(torch.randint(3, (10, )), False, False)
-        helper(torch.randint(3, (10, )), True, False)
-        helper(torch.randint(3, (10, )), False, True)
-        helper(torch.randint(3, (10, )), True, True)
-        helper(torch.randint(3, (1, )), True, True)
-        helper(torch.randint(3, (0, )), True, True)
-=======
         helper(torch.randint(3, (10,)), False, False)
         helper(torch.randint(3, (10,)), True, False)
         helper(torch.randint(3, (10,)), False, True)
         helper(torch.randint(3, (10,)), True, True)
         helper(torch.randint(3, (1,)), True, True)
         helper(torch.randint(3, (0,)), True, True)
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
     def test_unique_consecutive(self):
         def helper(x, dim, return_inverse, return_counts):
@@ -2401,15 +2316,6 @@ def helper(x, dim, return_inverse, return_counts):
 
             self.assertEqual(result, result_cpu)
         helper(torch.tensor([1, 2, 4, 2, 1]), 0, False, False)
-<<<<<<< HEAD
-        helper(torch.randint(3, (10, )), 0, False, False)
-        helper(torch.randint(3, (10, )), 0, True, False)
-        helper(torch.randint(3, (10, )), 0, False, True)
-        helper(torch.randint(3, (10, )), 0, True, True)
-        helper(torch.randint(3, (10, )), 0, True, True)
-        helper(torch.randint(3, (1, )), 0, True, True)
-        helper(torch.randint(3, (0, )), 0, True, True)
-=======
         helper(torch.randint(3, (10,)), 0, False, False)
         helper(torch.randint(3, (10,)), 0, True, False)
         helper(torch.randint(3, (10,)), 0, False, True)
@@ -2417,7 +2323,6 @@ def helper(x, dim, return_inverse, return_counts):
         helper(torch.randint(3, (10,)), 0, True, True)
         helper(torch.randint(3, (1,)), 0, True, True)
         helper(torch.randint(3, (0,)), 0, True, True)
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, False, False)
         helper(torch.tensor([[1, 1, 2, 3, 3, 2], [1, 1, 1, 2, 2, 1]]), 0, True, True)
@@ -2460,134 +2365,6 @@ def test_from_numpy_non_contiguous(self):
         t_mps = torch.tensor(a, device="mps")
         self.assertEqual(t_cpu, t_mps.to("cpu"))
 
-    # See https://github.com/pytorch/pytorch/issues/86954
-    def test_copy_non_contiguous(self):
-        x = torch.arange(27).reshape(3, 3, 3).permute(2, 0, 1)
-        self.assertFalse(x.is_contiguous())
-        y = x.to('mps')
-        self.assertFalse(y.is_contiguous())
-        self.assertEqual(x, y.to('cpu'))
-
-        x = torch.arange(4**3).reshape(4, 4, 4).permute((2, 0, 1))[1:, ::2]
-        y = x.to('mps')
-        self.assertEqual(x, y.to('cpu'))
-
-        x = torch.full((4, 4, 4, 4), 13, device="cpu")
-        y = torch.full((4, 4, 4, 4), 13, device="mps")
-        z = torch.arange(4**4).reshape(4, 4, 4, 4).permute(3, 2, 0, 1)[1::, ::2]
-        x.permute(3, 2, 1, 0)[1::, ::2] = z
-        # As y is on MPS and z on CPU, this dispatches to a copy operator
-        y.permute(3, 2, 1, 0)[1::, ::2] = z
-        self.assertEqual(x, y.to('cpu'))
-
-    # See https://github.com/pytorch/pytorch/pull/84742
-    # and https://github.com/pytorch/pytorch/pull/78319
-    def test_binops_dtype_precedence(self):
-        # Test dtype precedence (casting order) in binary operations by comparing to CPU result
-        # Example values for all dtypes supported on the MPS backend
-        sample_vals = {
-            torch.bool: [False, True],
-            torch.int16: [-15, 0, 1, 10],
-            torch.int32: [-376, 0, 1, 13],
-            torch.int64: [-8, 0, 1, 77],
-            torch.float16: [-234.5, 0.0, 1.0, 2.0],
-            torch.float32: [-1.0, 0.0, 0.1, 111.99],
-        }
-        # Test all combinations of dtypes, operations, dimensionality
-        for dtype1, dtype2, binop in itertools.product(
-                sample_vals.keys(), sample_vals.keys(), ['add', 'sub', 'mul', 'div']):
-            # bool minus bool is generally unsupported, so skip
-            if binop == 'sub' and (dtype1 == torch.bool or dtype2 == torch.bool):
-                continue
-            full_shape = (10,)
-            for val1, val2 in itertools.product(sample_vals[dtype1], sample_vals[dtype2]):
-                # print(f'{dtype1},{dtype2}: ({val1}).{binop}({val2})')
-                # print(getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                #            (torch.tensor(val2, dtype=dtype2, device='mps')))
-                # print(getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                #            (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='mps')),
-                    getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor([val2], dtype=dtype2, device='cpu')))
-                self.assertEqual(
-                    getattr(torch.tensor([val1], dtype=dtype1, device='mps'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor([val1], dtype=dtype1, device='cpu'), binop)
-                           (torch.tensor(val2, dtype=dtype2, device='cpu')))
-                # Test tensors created with torch.full
-                x1 = torch.full(full_shape, val1, dtype=dtype1, device='mps')
-                y1 = torch.tensor(val2, dtype=dtype2, device='mps')
-                x2 = torch.full(full_shape, val1, dtype=dtype1, device='cpu')
-                y2 = torch.tensor(val2, dtype=dtype2, device='cpu')
-                self.assertEqual(getattr(x1, binop)(y1), getattr(x2, binop)(y2))
-                x3 = torch.tensor(val1, dtype=dtype1, device='mps')
-                y3 = torch.full(full_shape, val2, dtype=dtype2, device='mps')
-                x4 = torch.tensor(val1, dtype=dtype1, device='cpu')
-                y4 = torch.full(full_shape, val2, dtype=dtype2, device='cpu')
-                self.assertEqual(getattr(x3, binop)(y3), getattr(x4, binop)(y4))
-                self.assertEqual(
-                    getattr(torch.tensor(val1, dtype=dtype1, device='mps'), binop)
-                           (torch.full(full_shape, val2, dtype=dtype2, device='mps')),
-                    getattr(torch.tensor(val1, dtype=dtype1, device='cpu'), binop)
-                           (torch.full(full_shape, val2, dtype=dtype2, device='cpu')))
-
-    def test_nansum(self):
-        def helper(dtype, noncontiguous, dim):
-            zero_cpu = torch.zeros((), dtype=dtype)
-
-            # Randomly scale the values
-            scale = random.randint(10, 100)
-            x_cpu: torch.Tensor = make_tensor(
-                (5, 5), dtype=dtype, device='cpu',
-                low=-scale, high=scale, noncontiguous=noncontiguous)
-
-            if dtype.is_floating_point:
-                nan_mask_cpu = x_cpu < (0.2 * scale)
-                x_no_nan_cpu = torch.where(nan_mask_cpu, zero_cpu, x_cpu)
-                x_cpu[nan_mask_cpu] = np.nan
-            else:
-                x_no_nan_cpu = x_cpu
-
-            x_mps = x_cpu.to('mps')
-            actual_out_mps = torch.empty(0, dtype=dtype, device='mps')
-            expect_out_cpu = torch.empty(0, dtype=dtype)
-            dim_kwargs = {"dim": dim} if dim is not None else {}
-            expect = torch.sum(x_no_nan_cpu, **dim_kwargs)
-
-            actual_cpu = torch.nansum(x_cpu, **dim_kwargs)
-            # Sanity check on CPU
-            self.assertEqual(expect, actual_cpu)
-
-            # Test MPS
-            actual_mps = torch.nansum(x_mps, **dim_kwargs)
-            # Test out= variant
-            torch.nansum(x_mps, out=actual_out_mps, **dim_kwargs)
-            torch.nansum(x_cpu, out=expect_out_cpu, **dim_kwargs)
-            self.assertEqual(expect, actual_mps)
-            self.assertEqual(expect_out_cpu, actual_out_mps)
-
-        args = itertools.product(
-            (torch.float16, torch.float32, torch.int32, torch.int64),   # dtype
-            (True, False),                                              # noncontiguous
-            (0, 1, None),                                               # dim
-        )
-
-        for dtype, noncontiguous, dim in args:
-            with self.subTest(dtype=dtype, noncontiguous=noncontiguous, dim=dim):
-                helper(dtype, noncontiguous, dim)
-
     def test_cumsum_all_dtypes(self):
         def helper(dtype):
             t = torch.tensor([1, 1, 1, 1], device="mps", dtype=dtype)
@@ -2605,22 +2382,32 @@ def helper(dtype):
             e_string = str(e)
             self.assertEqual(e_string, "MPS does not support cumsum op with int64 input")
 
-    def test_cumsum_minus_one_axis(self):
-        def helper(dtype):
-            # Test with axis -1
-            cpu_x = None
-            if(dtype == torch.float32):
-                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
-            else:
-                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+    def test_gelu_tanh(self):
+        def helper(shape):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
             x = cpu_x.detach().clone().to('mps')
 
-            cpu_y = cpu_x.cumsum(-1)
-            y = x.cumsum(-1)
+            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
+            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
+            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
 
-            self.assertEqual(y, cpu_y)
+        helper((2, 8, 4, 5))
 
-        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+    # # Failures due to precision issues, enable after resolving from mps
+    # def test_div_floor_int(self):
+    #     def helper(shape, dtype):
+    #         cpu_x = torch.randint(-9999, -1,shape, device='cpu', dtype=dtype)
+    #         x = cpu_x.detach().clone().to('mps')
+
+    #         cpu_y = torch.randint(1, 9999, shape, device='cpu', dtype=dtype)
+    #         y = cpu_y.detach().clone().to('mps')
+
+    #         div_result = torch.div(x, y,rounding_mode='floor')
+    #         div_result_cpu = torch.div(cpu_x, cpu_y, rounding_mode='floor')
+    #         self.assertEqual(div_result, div_result_cpu)
+
+    #     helper((2, 8, 4, 5), torch.int16)
+    #     helper((2, 8, 4, 5), torch.int32)
 
     def test_median_int16(self):
         def helper(shape, dtype):
@@ -2633,6 +2420,23 @@ def helper(shape, dtype):
 
         helper((2, 8, 4, 5), torch.int16)
 
+    def test_cumsum_minus_one_axis(self):
+        def helper(dtype):
+            # Test with axis -1
+            cpu_x = None
+            if dtype == torch.float32:
+                cpu_x = torch.randn(10, 3, device='cpu', dtype=torch.float32)
+            else:
+                cpu_x = torch.randint(0, 20, (10, 3), device='cpu', dtype=torch.float32)
+            x = cpu_x.detach().clone().to('mps')
+
+            cpu_y = cpu_x.cumsum(-1)
+            y = x.cumsum(-1)
+
+            self.assertEqual(y, cpu_y)
+
+        [helper(dtype) for dtype in [torch.float32, torch.int16, torch.int32, torch.uint8]]
+
 class TestLogical(TestCaseMPS):
     def _wrap_tensor(self, x, device="cpu", dtype=None, requires_grad=False):
         return torch.tensor(x, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -2785,6 +2589,20 @@ def test_smooth_l1_loss_reduction_mean_sum_backward(self):
 
 
 class TestNLLLoss(TestCaseMPS):
+    def test_nll2d_loss_backward(self, device='mps'):
+        a = torch.randn(3, 5, requires_grad=True, device=device)
+        b = torch.tensor([1, 0, 4], device=device)
+        loss = nn.NLLLoss()
+        out = loss(a, b)
+        self.assertIsNone(out.grad_fn._saved_weight)
+        loss = nn.NLLLoss(weight=torch.ones((5,), device=device))
+        out = loss(a, b)
+        self.assertEqual(out.grad_fn._saved_weight, torch.ones((5,)))
+
+        out.sum().backward()
+        with self.assertRaisesRegex(RuntimeError, "after they have already been freed"):
+            out.grad_fn._saved_weight
+
     def test_nll_loss_mismatched_batch(self, device='mps'):
         x = torch.randn((10, 3), requires_grad=True, device=device)
         # t should have size (10,)
@@ -2846,13 +2664,13 @@ def _nll_loss_helper(self, input_size, reduction, expected):
         input = torch.rand(input_size, requires_grad=True, device='cpu')
         num_channels = input_size[1]
         target_size = (input_size[0], ) + tuple(input_size[2:])
-        target = torch.randint(num_channels, target_size, device='cpu')
         weights = torch.randn(num_channels)
+        weights_mps = weights.to("mps")
+        target = torch.randint(num_channels, target_size, device='cpu')
 
         # MPS
         input_mps = input.detach().clone().to('mps').requires_grad_()
         target_mps = target.detach().clone().to('mps')
-        weights_mps = weights.to("mps")
 
         output_cpu = F.nll_loss(input, target, weight=weights, reduction=reduction)
         output_mps = F.nll_loss(input_mps, target_mps, weight=weights_mps, reduction=reduction)
@@ -3389,6 +3207,7 @@ def test_eq(self):
 
         self.assertEqual(result_cpu, result_mps.to('cpu'))
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_signed_vs_unsigned_comparison(self):
         cpu_x = torch.tensor((-1, 2, 3), device='cpu', dtype=torch.uint8)
         mps_x = torch.tensor((-1, 2, 3), device='mps', dtype=torch.uint8)
@@ -4542,26 +4361,6 @@ def helper(shape):
         helper((5, 9, 7, 4))
         helper((50, 20, 7, 4))
 
-    def test_sort(self):
-        for SIZE in (4, 2049):
-            device = 'mps'
-            x = torch.rand(4, SIZE, device=device)
-            res1val, res1ind = torch.sort(x)
-
-            res2val = torch.tensor((), device=device)
-            res2ind = torch.tensor((), device=device, dtype=torch.long)
-            torch.sort(x, out=(res2val, res2ind))
-            self.assertEqual(res1val, res2val, atol=0, rtol=0)
-            self.assertEqual(res1ind, res2ind, atol=0, rtol=0)
-            self.assertEqual(torch.argsort(x), res1ind)
-            self.assertEqual(x.argsort(), res1ind)
-
-            self.assertEqual(
-                torch.sort(torch.tensor((50, 40, 30, 20, 10), device=device))[0],
-                torch.tensor((10, 20, 30, 40, 50), device=device),
-                atol=0, rtol=0
-            )
-
     def test_upsample_nearest2d(self):
         def helper(N, C, H, W):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
@@ -4627,6 +4426,7 @@ def helper(N, C, H, W):
         helper(1, 1, 4, 4)
         helper(7, 5, 3, 2)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_interpolate(self):
         def helper(shape, output_size, scales, mode, align_corners=False):
             inputCPU = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
@@ -4776,6 +4576,8 @@ def helper(shape, padding, op, value=0):
         helper((2, 1, 6, 8), 2, nn.ReplicationPad2d)
         # verify if a change in shape of padding would cause problems with graph caching
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ReplicationPad2d)
+        # negative padding
+        helper((1, 3, 4, 4), (-1, 1, -2, 1), nn.ReplicationPad2d)
         # Constant Pad 2D
         helper((2, 1, 6, 8), (2, 4, 3, 5), nn.ConstantPad2d)
         # input size < pad size
@@ -4795,15 +4597,10 @@ def helper(shape, padding, op, value=0):
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ReplicationPad3d)
         # Constant Pad 3D
         helper((2, 4, 6, 8, 4), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
-        # input size < pad size
-        helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
         # check the workaround for the right padding bug in Monterey
         helper((1, 2, 2, 2, 2), (0, 1), nn.ConstantPad3d)
-<<<<<<< HEAD
-=======
         # input size < pad size
         helper((2, 4, 6), (1, 3, 3, 5, 3, 4), nn.ConstantPad3d)
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
     # Test stack forward
     def test_stack(self):
@@ -5086,7 +4883,6 @@ def helper(shape, dim=0):
             for dim in range(len(shape)):
                 helper(shape, dim)
 
-<<<<<<< HEAD
     # Test softplus
     def test_softplus(self):
         def helper(shape, beta=1, threshold=20):
@@ -5110,31 +4906,8 @@ def helper(shape, beta=1, threshold=20):
             for beta in [0.5, 1, 2, 3, 4]:
                 for threshold in [0.5, 20, 30, 40, 50]:
                     helper(shape, beta, threshold)
-=======
-    # # Test softplus
-    # def test_softplus(self):
-    #     def helper(shape, beta=1, threshold=20):
-    #         cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=True)
-    #         x = cpu_x.detach().clone().to('mps').requires_grad_()
-
-    #         softplus_result = torch.nn.Softplus(beta=beta, threshold=threshold)(x)
-    #         softplus_result_cpu = torch.nn.Softplus(beta=beta, threshold=threshold)(cpu_x)
-
-    #         cpu_grad = torch.randn(softplus_result.shape)
-    #         grad = cpu_grad.to('mps')
 
-    #         softplus_result.backward(gradient=grad)
-    #         softplus_result_cpu.backward(gradient=cpu_grad)
-
-    #         self.assertEqual(softplus_result, softplus_result_cpu)
-    #         self.assertEqual(x.grad, cpu_x.grad)
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
-
-    #     # Test empty shape too
-    #     for shape in [(), (2, 3), (10, 10), (2, 3, 4, 5)]:
-    #         for beta in [0.5, 1, 2, 3, 4]:
-    #             for threshold in [0.5, 20, 30, 40, 50]:
-    #                 helper(shape, beta, threshold)
+    # Test silu
 
     def test_silu(self):
         def helper(shape):
@@ -5340,17 +5113,6 @@ def _gelu_ref(X):
         finally:
             torch.set_num_threads(num_threads)
 
-    def test_gelu_tanh(self):
-        def helper(shape):
-            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float)
-            x = cpu_x.detach().clone().to('mps')
-
-            gelu_tanh_result = torch.nn.functional.gelu(x, approximate='tanh')
-            gelu_tanh_result_cpu = torch.nn.functional.gelu(cpu_x, approximate='tanh')
-            self.assertEqual(gelu_tanh_result, gelu_tanh_result_cpu)
-
-        helper((2, 8, 4, 5))
-
     # Test hardtanh
     def test_hardtanh(self):
         def helper(shape, min_val, max_val, inplace=False):
@@ -5527,14 +5289,14 @@ def helper(shape):
 
     # Test index add
     def test_index_add(self):
-        def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dtype=torch.int32):
-            cpu_x = torch.randn(shape, device='cpu', dtype=x_dtype, requires_grad=False)
+        def helper(shape, dim, index, source_shape, alpha, idx_dtype=torch.int32):
+            cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
             x = cpu_x.detach().clone().to('mps')
 
             cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype)
             idx = cpu_idx.detach().clone().to('mps')
 
-            cpu_source = torch.randn(source_shape, device='cpu', dtype=x_dtype, requires_grad=False)
+            cpu_source = torch.randn(source_shape, device='cpu', dtype=torch.float, requires_grad=False)
             source = cpu_source.detach().clone().to('mps')
 
             idx_result = torch.index_add(x, dim=dim, index=idx, source=source, alpha=alpha)
@@ -5550,8 +5312,6 @@ def helper(shape, dim, index, source_shape, alpha, x_dtype=torch.float32, idx_dt
         # test result dim=1
         helper((2,), 0, [1], (1,), 6.0)
         helper(2, 0, 1, 1, 6)
-        # test float16
-        helper((2,), 0, [1], (1,), 6.0, x_dtype=torch.float16)
 
     # Test flip
     def test_flip(self):
@@ -5595,23 +5355,6 @@ def helper(shape, dim, index, idx_dtype=torch.int32):
         helper((2, 8, 4, 5), 2, [3, 0, 1])
         helper((2, 8, 4, 5), 3, [2, 3, 0])
         helper((2, 3, 3), -1, [1, 2])
-        helper((), 0, [0])
-        helper((5), 0, [])
-
-    def test_index_select_scalar(self):
-        def helper(value, dim, index, idx_dtype=torch.int32):
-            cpu_x = torch.tensor(value, device='cpu', dtype=torch.float, requires_grad=False)
-            x = cpu_x.detach().clone().to('mps')
-
-            cpu_idx = torch.tensor(index, device='cpu', dtype=idx_dtype)
-            idx = cpu_idx.detach().clone().to('mps')
-
-            idx_result = torch.index_select(x, dim=dim, index=idx)
-            idx_result_cpu = torch.index_select(cpu_x, dim=dim, index=cpu_idx)
-
-            self.assertEqual(idx_result, idx_result_cpu)
-
-        helper(22, 0, [])
 
     def test_embedding_dense_backward(self):
         def helper(n, d, m, idx):
@@ -5828,11 +5571,7 @@ def helper(shape, dim, idx_shape, src_shape, idx_dtype=torch.int64, reduce_str="
 
         # for reduce in ["sum", "prod", "amax", "amin"]:
         for reduce_type in ["add", "multiply"]:
-<<<<<<< HEAD
             helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce_type)
-=======
-            helper((2, 3), 0, (5, 3), (5, 3), reduce_str=reduce)
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
             helper((2, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (10, 8, 4, 5), (10, 8, 4, 5), reduce_str=reduce_type)
             helper((8, 8, 4, 5), 0, (4, 7, 3, 2), (4, 7, 3, 2), reduce_str=reduce_type)
@@ -5994,13 +5733,6 @@ def test_arange_empty(self):
         y_cpu = torch.arange(0, 0, 1, out=out_cpu)
         self.assertEqual(y_mps, y_cpu)
 
-    # Test rgange
-    def test_range(self):
-        self.assertEqual(np.arange(11, dtype=np.float32), torch.range(0, 10, device='mps'))
-        self.assertEqual(np.arange(7, 0, -1, dtype=np.float32), torch.range(7, 1, -1, device='mps'))
-        self.assertEqual(np.array([1.0000, 1.3000, 1.6000, 1.9000], dtype=np.float32), torch.range(1, 2, .3, device='mps'))
-        self.assertEqual(np.arange(6.3, dtype=np.float32), torch.arange(0, 6.3, device='mps'))
-
     # Test softmax
     def test_softmax(self):
         def helper(shape, dim, channels_last=False):
@@ -6239,25 +5971,24 @@ def test_device_synchronize(self):
         torch.mps.synchronize()
 
     def test_mps_allocator_module(self):
-        # first garbage collect and empty the cached blocks
+        # limit memory allocations up to 1.5x of recommended maximum size from Metal API
+        torch.mps.set_per_process_memory_fraction(1.5)
+
+        # just running some ops to allocate buffers
+        net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
+            .to(device='mps', dtype=torch.float)
+
+        x = torch.rand(1, 128, 6, 6, device='mps', dtype=torch.float, requires_grad=True)
+        x = net1(x)
+        print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, "
+              f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n")
         gc.collect()
+        # running this test alone will not release any buffers as they are in use.
+        # however, running along with other tests should release the cached allocations.
         torch.mps.empty_cache()
-        # measure memory allocations from MPSAllocator
-        current_alloc_before = torch.mps.current_allocated_memory()
-        # after garbage collection and emptying the cache the
-        # current_allocated_memory must be zero
-        self.assertTrue(current_alloc_before == 0)
-        # measure total memory allocations from Metal driver
-        driver_alloc_before = torch.mps.driver_allocated_memory()
-        # allocate a new 8 MB tensor to force allocation of a new Metal Heap
-        x = torch.ones(1024 * 1024 * 8, device="mps")
-        # get memory allocations after allocating tensor x
-        current_alloc_after = torch.mps.current_allocated_memory()
-        driver_alloc_after = torch.mps.driver_allocated_memory()
-        # current and driver memory allocations must have
-        # grown at this point
-        self.assertTrue(current_alloc_after > current_alloc_before)
-        self.assertTrue(driver_alloc_after > driver_alloc_before)
+        x.backward(torch.randn_like(x))
+        print(f"current_allocated: {torch.mps.current_allocated_memory() / 1024} KB, "
+              f"driver_allocated: {torch.mps.driver_allocated_memory() / 1024} KB\n")
 
     # Test random_.to and random_.from
     def test_random(self):
@@ -6425,65 +6156,18 @@ def helper(probs, compare_mean, compare_var, num_samples=5, replacement=True):
         helper(np.array([1, 1, 1, 1, 1]), (0 + 1 + 2 + 3 + 4) / 5, (6 - 2 * 2), 10000)
         helper(np.array([[1, 1, 1, 1, 1, 1, 1]]), 0, 0, 7, False)
 
-    def test_cumsum_dim_check(self):
-        x = torch.rand((3, 3), device="mps")
-        self.assertEqual(x.cumsum(1), x.cumsum(-1))
-        self.assertEqual(x.cumsum(0), x.cumsum(-2))
-        self.assertRaises(IndexError, lambda: x.cumsum(2))
-        self.assertRaises(IndexError, lambda: x.cumsum(-3))
-
-
-class TestTopK(TestCase):
-    def _test_topk(self, shape, largest):
-        cpu_x = torch.randn(shape, device='cpu', dtype=torch.float, requires_grad=False)
-        x = cpu_x.detach().clone().to('mps')
-        if isinstance(shape, tuple):
-            for curr_dim, dim_size in enumerate(shape):
-                for k in range(1, dim_size + 1):
-                    topk_values, topk_indices = torch.topk(x, k, dim=curr_dim, largest=largest)
-                    topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=curr_dim, largest=largest)
-                    self.assertEqual(topk_values, topk_values_cpu)
-                    self.assertEqual(topk_indices, topk_indices_cpu)
-        else:
-            for k in range(1, shape):
-                topk_values, topk_indices = torch.topk(x, k, dim=0, largest=largest)
-                topk_values_cpu, topk_indices_cpu = torch.topk(cpu_x, k, dim=0, largest=largest)
-                self.assertEqual(topk_values, topk_values_cpu)
-                self.assertEqual(topk_indices, topk_indices_cpu)
-
-    def test_topk(self):
-        largest_vals = [True, False]
-        shapes = [
-            # Zero Element Tensors
-            0,
-            (1, 0),
-            (0, 1),
-            (1, 0, 1),
-            # Multiple Element Tensors
-            1,
-            2,
-            (5, 1),
-            (1, 5),
-            (5, 9, 7, 4),
-        ]
-
-        for shape in shapes:
-            for largest_val in largest_vals:
-                with self.subTest(shape=shape, largest_val=largest_val):
-                    self._test_topk(shape, largest_val)
-
 class TestNNMPS(NNTestCase):
 
     def _create_basic_net(self):
         class Layer(nn.Module):
             def __init__(self):
-                super().__init__()
+                super(Layer, self).__init__()
                 self.layer_dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('layer_dummy_buf', torch.zeros(1, 3, 3, 7))
 
         class Net(nn.Module):
             def __init__(self):
-                super().__init__()
+                super(Net, self).__init__()
                 self.l1 = Layer()
                 self.dummy_param = Parameter(torch.empty(3, 5))
                 self.register_buffer('dummy_buf', torch.zeros(7, 3, 3, 1))
@@ -6571,27 +6255,24 @@ def test_zero_grad(self):
         self.assertIsNotNone(module.weight.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         module.zero_grad()
-        self.assertIsNone(module.weight.grad)
+        self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
 
         module.bias.requires_grad = True
         module.zero_grad()
-        self.assertIsNone(module.weight.grad)
+        self.assertIsNotNone(module.weight.grad)
         self.assertIsNone(module.bias.grad)
         module(i).sum().backward()
         self.assertIsNotNone(module.weight.grad)
         self.assertIsNotNone(module.bias.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         self.assertGreater(module.bias.grad.data.abs().sum(), 0)
-
-        # Force set to zeros.
-        module.zero_grad(set_to_none=False)
+        module.zero_grad()
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
 
-        module.zero_grad()
+        # Force set to None.
+        module.zero_grad(set_to_none=True)
         self.assertIsNone(module.weight.grad)
-        self.assertIsNone(module.bias.grad)
-
 
     def test_no_grad(self):
         for dtype in [torch.bfloat16, torch.float, torch.double]:
@@ -6706,33 +6387,6 @@ def attention2(key, *, workaround=False, device):
         r2_cpu = r2.to("cpu")
         self.assertEqual(r1, r2_cpu)
 
-    def test_group_norm_backward(self, device='mps'):
-        # See https://github.com/pytorch/pytorch/issues/88331 for more detail
-        shape = [1, 4, 16, 16]
-        x = torch.full(shape, 7.0, device=device)
-
-        target = torch.ones((1, 3, 128, 128), device=device)
-
-        conv_in = nn.Conv2d(4, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device)
-        conv_out = nn.Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), device=device)
-        norm = nn.GroupNorm(32, 128, eps=1e-6, affine=True, device=device)
-
-        with torch.enable_grad():
-            x = x.detach().requires_grad_()
-            out = 5.5 * x
-            out = conv_in(out)
-            out = out + norm(out)
-            out = out + norm(out)
-            out = out + norm(out)
-            out = F.interpolate(out, scale_factor=8.0, mode="nearest")
-            out = norm(out)
-            out = conv_out(out)
-
-            loss = (out - target).norm(dim=-1).sum()
-            grad = -torch.autograd.grad(loss, x)[0]
-            self.assertFalse(grad.detach().isnan().any().item(), 'NaN gradients returned by autograd')
-
-
     # def test_conv2d_same_padding(self, device='mps'):
         # x = torch.rand(1, 1, 10, 11, device=device)
         # y = torch.rand(1, 1, 4, 5, device=device)
@@ -7547,10 +7201,12 @@ def test_T(self, device="mps"):
         self.assertEqual(t2, t1)
         b = torch.randn(10, device=device)
         self.assertEqual(b, b.T)
+        scalar = torch.tensor(5, device=device)
+        self.assertEqual(scalar, scalar.T)
 
     def test_transposes(self, device="mps", dtype=torch.float32):
         for op in ("T", "H", "mT", "mH", "adjoint"):
-            shapes = ((2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((2, 3),)
+            shapes = ((), (2, 3), (2, 3, 4)) if op[0] == "m" or op == "adjoint" else ((), (2, 3),)
             for shape in shapes:
                 a = make_tensor(shape, device=device, dtype=dtype)
                 t1 = getattr(a, op)
@@ -8407,6 +8063,7 @@ def test_bool_indices(self, device="mps"):
             self.assertEqual(v[boolIndices], torch.tensor([True], dtype=torch.bool, device=device))
             self.assertEqual(len(w), 2)
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_bool_indices_accumulate(self, device="mps"):
         mask = torch.zeros(size=(10, ), dtype=torch.uint8, device=device)
         mask = mask > 0
@@ -8597,6 +8254,7 @@ def helper(device, dtype):
             self.assertEqual(res.shape, src.shape)
         [helper(device="mps", dtype=dtype) for dtype in [torch.float, torch.int32]]
 
+    @unittest.skipIf(product_version < 13.0, "Skipped on macOS 12")
     def test_index_src_datatype(self):
         def helper(device, dtype):
             orig_dtype = dtype
@@ -9078,73 +8736,134 @@ def test_serialization_map_location(self):
 for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
     del MPS_DTYPES[MPS_DTYPES.index(t)]
 
+abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()}
+class UnitTestSample:
+    def __init__(self, dtype, args, params, out):
+        requires_grad = (dtype.is_floating_point or dtype.is_complex)
+        self.args_ = [t.detach().to('mps').requires_grad_(requires_grad) for t in args]
+        self.params_ = params
+        self.out_ = out
+
+    def sample(self):
+        return self.args_ + self.params_
+
+    def expected(self):
+        return tuple(self.out_)
+
+CUDA_RESULT = dict()
+OP_UNIT_TEST = dict()
+dirname = os.path.dirname(__file__)
+filename = os.path.join(dirname, "cuda_results.yaml")
+with open(filename) as f:
+    data = yaml.safe_load(f)
+    for key, value in data['ConsistencyTest'].items():
+        CUDA_RESULT[key] = torch.as_tensor(value)
+    for key, samples in data['UnitTest'].items():
+        unit_tests = []
+        for sample in samples:
+            dtype = abbrs_to_torch_dtype_dict[sample['dtype']]
+            args = [torch.as_tensor(arg).to(dtype) for arg in sample['args']]
+            params = sample['params']
+            out = [torch.as_tensor(res).to(dtype) for res in sample['res']]
+            unit_tests.append(UnitTestSample(dtype, args, params, out))
+        OP_UNIT_TEST[key] = unit_tests
 
 class TestConsistency(TestCaseMPS):
+
     # TODO: This is only used while some ops are being added.
     # This list should contain all ops and dtypes eventually
     # This can be generated automatically in the `new_mps_allowlist.txt` file
     # by doing `EXPECTTEST_ACCEPT=1 python test_mps.py TestConsistencyCPU`
     # You most likely do NOT want to modify this manually
     ALLOWLIST_OP = {
+        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__getitem__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__radd__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rand__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rdiv__': ['f16', 'f32', 'i16', 'i32', 'u8'],
-        '__rmatmul__': ['f32'],
+        '__rdiv__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rmatmul__': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__ror__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        '__rpow__': ['f16'],
+        '__rpow__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        '__rsub__': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         '__rxor__': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.norm': ['f16', 'f32'],
-        'masked.normalize': ['f16', 'f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'masked.std': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.var': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'abs': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'acos': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'acosh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
+        'abs': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'acosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'addbmm': ['f32'],
+        'addbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'addcdiv': ['f32'],
         'addcmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'addmm': ['f32'],
-        'addmv': ['f32'],
-        'addr': ['f32'],
+        'addmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addmv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'addr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'all': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'allclose': ['f16', 'f32'],
+        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'aminmax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'angle': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'any': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'arange': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amix': ['f32'],
-        'asin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'asinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'atan2': ['f32'],
-        'atanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'argsort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'argwhere': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'as_strided_scatter': ['b8',
+                               'f16',
+                               'f32',
+                               'i16',
+                               'i32',
+                               'i64',
+                               'u8'],
+        'asin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'asinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atan2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'atanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_1d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_2d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'atleast_3d': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'baddbmm': ['f32'],
+        'baddbmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bincount': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_and': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_left_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_not': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_or': ['b8', 'i16', 'i32', 'i64', 'u8'],
         'bitwise_right_shift': ['i16', 'i32', 'i64', 'u8'],
         'bitwise_xor': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'bmm': ['f32'],
+        'block_diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bmm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'bool': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'broadcast_to': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'bucketize': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'byte': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cartesian_prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'cat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'ceil': ['f32', 'int32', 'int64', 'f16'],
+        'cdist': ['f32'],
+        'cdouble': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ceil': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cfloat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'chalf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'char': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'clamp_max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
@@ -9152,241 +8871,659 @@ class TestConsistency(TestCaseMPS):
         'clone': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'column_stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'combinations': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'complex': ['f16', 'f32'],
         'conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'conj_physical': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'constant_pad_nd': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'contiguous': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'corrcoef': ['f32'],
-        'cos': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cosh': ['b8', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'cov': ['f32'],
-        'cumsum': ['f16', 'f32', 'int16', 'int32'],
+        'copysign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'corrcoef': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cos': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cosh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cov': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummax': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cummin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'deg2rad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'diag': ['f32', 'i32'],
-        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'diagflat': ['f32', 'i32'],
-        'diagonal_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'diag': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diag_embed': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagflat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'diagonal_scatter': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
         'diff': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'dist': ['f32'],
+        'digamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32', 'u8', 'b8', 'i16', 'i32', 'i64'],
         'dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'einsum': ['f32'],
+        'double': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'dstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'einsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'empty_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'eq': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'equal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'erf': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
+        'erf': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'erfinv': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'exp2': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expand_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'expm1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'eye': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.fftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.hfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ifftshift': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.ihfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.irfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfft2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fft.rfftn': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'flatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flip': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'fliplr': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'flipud': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flip': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fliplr': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'flipud': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'float': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'floor': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'floor_divide': ['f32', 'f16'],
-        'fmod': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'float_power': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'floor_divide': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'fmod': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'full_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'gather': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'gradient': ['f16', 'f32', 'i16'],
+        'gcd': ['i16', 'i32', 'i64', 'u8'],
         'ge': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'geqrf': ['f32'],
+        'gradient': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'grid_sampler_2d': ['f32'],
         'gt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'half': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'heaviside': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'histc': ['f32'],
+        'histogram': ['f32'],
+        'histogramdd': ['f32'],
+        'hsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'hstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'hypot': ['f32'],
+        'i0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'igamma': ['f16', 'f32'],
+        'igammac': ['f16', 'f32'],
+        'index_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_reduce': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'index_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'inner': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'int': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isclose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isfinite': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isin': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'isinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isnan': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isneginf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'isposinf': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'isreal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'kron': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.matrix_norm': ['f16'],
+        'kthvalue': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'lcm': ['i16', 'i32', 'i64', 'u8'],
+        'ldexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'le': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lerp': ['f32'],
+        'lgamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.det': ['f32'],
+        'linalg.eig': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvals': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.ldl_solve': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'linalg.vecdot': ['f32'],
         'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'log': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log10': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log1p': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log2': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'log_softmax': ['f32'],
-        'logaddexp': ['f16', 'f32'],
-        'logaddexp2': ['f16', 'f32'],
+        'log': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log10': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log1p': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log2': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'log_softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
         'logical_and': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_not': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_or': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logical_xor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logit': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'logspace': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'logsumexp': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'logsumexp': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'long': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lt': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.amin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmax': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.argmin': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumprod': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.cumsum': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked.var': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_fill': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'masked_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'masked_select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'matmul': ['f32'],
-        'mm': ['f32'],
-        'mv': ['f32'],
+        'matmul': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'matrix_exp': ['f32'],
+        'max': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'meshgrid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'min': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'minimum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mm': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mode': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'movedim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'msort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'mul': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'multinomial': ['f32'],
+        'mv': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'mvlgamma': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nan_to_num': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'nn.functional.adaptive_max_pool1d': ['f32'],
-        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nanquantile': ['f32'],
+        'nansum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'narrow_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['b8',
+                                    'f16',
+                                    'f32',
+                                    'i16',
+                                    'i32',
+                                    'i64',
+                                    'u8'],
+        'native_layer_norm': ['f32'],
+        'ne': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'neg': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_empty_strided': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'new_full': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'new_zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nextafter': ['f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
         'nn.functional.adaptive_avg_pool1d': ['f32'],
         'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
+        'nn.functional.adaptive_max_pool1d': ['f32'],
+        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
         'nn.functional.avg_pool1d': ['f32', 'i64'],
         'nn.functional.avg_pool2d': ['f32', 'i64'],
+        'nn.functional.avg_pool3d': ['f32', 'i64'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.binary_cross_entropy': ['f32'],
         'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
-        'nn.functional.cosine_embedding_loss': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.cosine_embedding_loss': ['b8',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
         'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['b8',
+                                                'f16',
+                                                'f32',
+                                                'i16',
+                                                'i32',
+                                                'i64',
+                                                'u8'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
         'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
         'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32', 'i16', 'i32', 'i64'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
-        'nn.functional.kl_div': ['f32', 'i16', 'i32', 'i64'],
+        'nn.functional.interpolate': ['f32', 'u8'],
+        'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
-        'nn.functional.linear': ['f32'],
-        'nn.functional.local_response_norm': ['f32'],
-        'nn.functional.margin_ranking_loss': ['f32', 'i16', 'i32'],
+        'nn.functional.linear': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.local_response_norm': ['f32', 'i64'],
+        'nn.functional.logsigmoid': ['f32'],
+        'nn.functional.margin_ranking_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
-        'max_pool2d_with_indices_backward': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f16', 'f32'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
         'nn.functional.nll_loss': ['f32'],
-        'nn.functional.pad': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padconstant': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.padreflect': ['f32'],
-        'nn.functional.padreplicate': ['f32'],
-        'nn.functional.pairwise_distance': ['f16', 'f32', 'i16', 'i32', 'i64'],
-        'nn.functional.poisson_nll_loss': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.normalize': ['f32'],
+        'nn.functional.one_hot': ['i64'],
+        'nn.functional.pad': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'nn.functional.pairwise_distance': ['f16',
+                                            'f32',
+                                            'i16',
+                                            'i32',
+                                            'i64',
+                                            'u8'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['b8',
+                                        'f16',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'nn.functional.pixel_unshuffle': ['b8',
+                                          'f16',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'nn.functional.poisson_nll_loss': ['f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
         'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.relu6': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
         'nn.functional.smooth_l1_loss': ['f16', 'f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
-        'nn.functional.softplus': ['f32'],
-        'nn.functional.softsign': ['f16', 'f32', 'i16', 'u8'],
-        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'u8'],
+        'nn.functional.softmin': ['f32', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softshrink': ['f32'],
+        'nn.functional.softsign': ['f16',
+                                   'f32',
+                                   'i16',
+                                   'i32',
+                                   'i64',
+                                   'u8'],
+        'nn.functional.tanhshrink': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'nn.functional.threshold': ['f32', 'i16', 'i32', 'i64', 'u8'],
-        'nn.functional.triplet_margin_loss': ['f32', 'i16', 'i32', 'i64'],
-        'nn.functional.triplet_margin_with_distance_loss': ['f32', 'i16', 'i32', 'i64'],
+        'nn.functional.triplet_margin_loss': ['f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'nn.functional.triplet_margin_with_distance_loss': ['f32',
+                                                            'i16',
+                                                            'i32',
+                                                            'i64',
+                                                            'u8'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'nn.functional.upsample_nearest': ['f32'],
+        'nn.functional.upsample_nearest': ['f32', 'u8'],
+        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
         'norm': ['f32', 'f16'],
+        'normal': ['f16', 'f32'],
+        'ones': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ones_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'ormqr': ['f32'],
+        'outer': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pca_lowrank': ['f32'],
+        'permute': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'pinverse': ['f32'],
+        'polar': ['f32'],
+        'polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'positive': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'pow': ['f16'],
+        'pow': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'prod': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'put': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randint_like': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'randn': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'real': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'remainder' : ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reciprocal': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'remainder': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'repeat_interleave': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'resize_': ['b8', 'i16', 'i32', 'i64', 'u8'],
-        'resize_as_': ['b8', 'i16', 'i32', 'i64', 'u8'],
+        'repeat_interleave': ['b8',
+                              'f16',
+                              'f32',
+                              'i16',
+                              'i32',
+                              'i64',
+                              'u8'],
+        'reshape': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'reshape_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'resize_as_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_conj': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'resolve_neg': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'roll': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'rot90': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'round': ['f32', 'f16', 'i16', 'i32', 'i64'],
-        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'round': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'rsub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'scalar_tensor': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'scatter_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'select_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
+        'scatter_reduce': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'searchsorted': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'select_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sgn': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'short': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sigmoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8'],
-        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'u8', 'i64'],
-        'sin': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'sinh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'slice_scatter': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'softmax': ['f32'],
+        'sigmoid': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sign': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'signal.windows.bartlett': ['f16', 'f32'],
+        'signal.windows.blackman': ['f16', 'f32'],
+        'signal.windows.cosine': ['f16', 'f32'],
+        'signal.windows.exponential': ['f16', 'f32'],
+        'signal.windows.gaussian': ['f16', 'f32'],
+        'signal.windows.general_cosine': ['f16', 'f32'],
+        'signal.windows.general_hamming': ['f16', 'f32'],
+        'signal.windows.hamming': ['f16', 'f32'],
+        'signal.windows.hann': ['f16', 'f32'],
+        'signal.windows.kaiser': ['f16', 'f32'],
+        'signbit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sin': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinc': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sinh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'slice': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'slice_scatter': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'softmax': ['f32', 'b8', 'f16', 'i16', 'i32', 'i64', 'u8'],
+        'sort': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.airy_ai': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_j1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y0': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.bessel_y1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.chebyshev_polynomial_t': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.chebyshev_polynomial_u': ['b8',
+                                           'f32',
+                                           'i16',
+                                           'i32',
+                                           'i64',
+                                           'u8'],
+        'special.entr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.erfcx': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.hermite_polynomial_h': ['b8',
+                                         'f32',
+                                         'i16',
+                                         'i32',
+                                         'i64',
+                                         'u8'],
+        'special.hermite_polynomial_he': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.i0e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.i1e': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.laguerre_polynomial_l': ['b8',
+                                          'f32',
+                                          'i16',
+                                          'i32',
+                                          'i64',
+                                          'u8'],
+        'special.log_ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.modified_bessel_i0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_i1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k0': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
+        'special.modified_bessel_k1': ['b8',
+                                       'f32',
+                                       'i16',
+                                       'i32',
+                                       'i64',
+                                       'u8'],
         'special.ndtr': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.ndtri': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.polygamma': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.scaled_modified_bessel_k0': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.scaled_modified_bessel_k1': ['b8',
+                                              'f32',
+                                              'i16',
+                                              'i32',
+                                              'i64',
+                                              'u8'],
+        'special.spherical_bessel_j0': ['b8',
+                                        'f32',
+                                        'i16',
+                                        'i32',
+                                        'i64',
+                                        'u8'],
+        'special.xlog1py': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'special.zeta': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sqrt': ['b8', 'f32', 'i16', 'i32', 'u8'],
+        'split_with_sizes': ['b8',
+                             'f16',
+                             'f32',
+                             'i16',
+                             'i32',
+                             'i64',
+                             'u8'],
+        'sqrt': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'square': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'squeeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'stack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'stft': ['f32'],
         'sub': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'sum_to_size': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tan': ['b8', 'i16', 'i32', 'u8'],
-        'tanh': ['b8', 'f32', 'i16', 'i32', 'u8'],
-        'tensordot': ['f32'],
+        'take': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'take_along_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tan': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'tanh': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tensor_split': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'tile': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'topk': ['f32', 'f16'],
-        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64'],
-        'sort': ['f32', 'i16', 'i32', 'i64'],
-        'argsort': ['f32', 'i16', 'i32', 'i64'],
+        'tensordot': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'tile': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'to_sparse': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'topk': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'trace': ['f32', 'i16', 'i32', 'i64', 'u8'],
+        'transpose': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'trapezoid': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'cumulative_trapezoid': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'trapz': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'triangular_solve': ['f32'],
         'tril': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'tril_indices': ['i32', 'i64'],
         'triu': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'triu_indices': ['i32', 'i64'],
         'true_divide': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'trunc': ['f32'],
+        'trunc': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'unbind': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unflatten': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'unfold_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'uniform': ['f16', 'f32'],
+        'unique_consecutive': ['b8', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'unsqueeze': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32', 'i16', 'i32', 'i64', 'u8'],
         'view': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'view_as': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'view_as_complex': ['f16', 'f32'],
+        'view_copy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vsplit': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'vstack': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
         'where': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'nonzero': ['b8', 'u8', 'f16', 'f32', 'i16', 'i32', 'i64'],
-        'cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.cross': ['f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'unique_consecutive': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'std': ['f16', 'f32'],
-        'var': ['f16', 'f32'],
-        'amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'mean': ['f16', 'f32'],
-        'count_nonzero': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amax': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.amin': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.mean': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.prod': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'masked.sum': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'native_layer_norm': ['torch.float32'],
-        'nn.functional.layer_norm': ['torch.float32'],
-        'nn.functional.bilinear': ['f32'],
-        'linalg.solve_triangular': ['f32'],
-        'triangular_solve': ['f32'],
-        '_native_batch_norm_legit': ['f32'],
-        'native_batch_norm': ['f32'],
-        'minreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'maxreduction_with_dim': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'linalg.inv': ['f32'],
-        'linalg.inv_ex': ['f32'],
-        'mH': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'mT': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'T': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
-        'H': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'xlogy': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zero_': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'zeros_like': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'index_add': ['b8', 'f16', 'f32', 'i16', 'i32', 'i64', 'u8'],
+        'nn.functional.softplus': ['f32'],
     }
 
-
     ALLOWLIST_OP_GRAD = {
+        'H': ['f16', 'f32'],
+        'T': ['f16', 'f32'],
+        '__getitem__': ['f16', 'f32'],
         '__radd__': ['f16', 'f32'],
         '__rdiv__': ['f16', 'f32'],
         '__rmatmul__': ['f32'],
+        '__rmod__': ['f16', 'f32'],
         '__rmul__': ['f16', 'f32'],
-        'masked.log_softmax': ['f32'],
-        'masked.logaddexp': ['f32'],
-        'masked.softmax': ['f32'],
-        'masked.softmin': ['f32'],
-        'masked.std': ['f32'],
+        '__rpow__': ['f32'],
+        '__rsub__': ['f16', 'f32'],
+        '_native_batch_norm_legit': ['f32'],
+        '_softmax_backward_data': ['f32'],
         'abs': ['f16', 'f32'],
         'acos': ['f32'],
         'acosh': ['f32'],
@@ -9398,168 +9535,524 @@ class TestConsistency(TestCaseMPS):
         'addmv': ['f32'],
         'addr': ['f32'],
         'all': ['f16', 'f32'],
+        'amax': ['f16', 'f32'],
+        'amin': ['f16', 'f32'],
+        'angle': ['f16', 'f32'],
         'any': ['f16', 'f32'],
         'arange': ['f16', 'f32'],
         'argmax': ['f16', 'f32'],
         'argmin': ['f16', 'f32'],
+        'argsort': ['f16', 'f32'],
+        'argwhere': ['f16', 'f32'],
+        'as_strided': ['f16', 'f32'],
+        'as_strided_scatter': ['f16', 'f32'],
         'asin': ['f32'],
         'asinh': ['f32'],
         'atan': ['f32'],
         'atan2': ['f32'],
+        'atanh': ['f32'],
         'atleast_1d': ['f16', 'f32'],
         'atleast_2d': ['f16', 'f32'],
         'atleast_3d': ['f16', 'f32'],
         'baddbmm': ['f32'],
+        'bernoulli': ['f32'],
+        'bfloat16': ['f16', 'f32'],
         'block_diag': ['f16', 'f32'],
         'bmm': ['f32'],
+        'bool': ['f16', 'f32'],
         'broadcast_shapes': ['f32'],
+        'broadcast_tensors': ['f16', 'f32'],
+        'broadcast_to': ['f16', 'f32'],
+        'bucketize': ['f16', 'f32'],
+        'byte': ['f16', 'f32'],
+        'cartesian_prod': ['f16', 'f32'],
+        'cat': ['f16', 'f32'],
+        'cdist': ['f32'],
         'ceil': ['f32'],
+        'char': ['f16', 'f32'],
+        'cholesky': ['f32'],
+        'cholesky_inverse': ['f32'],
+        'cholesky_solve': ['f32'],
         'chunk': ['f16', 'f32'],
+        'clamp': ['f32'],
+        'clamp_max': ['f16', 'f32'],
+        'clamp_min': ['f16', 'f32'],
         'clone': ['f16', 'f32'],
         'column_stack': ['f16', 'f32'],
+        'combinations': ['f16', 'f32'],
         'conj': ['f16', 'f32'],
         'conj_physical': ['f16', 'f32'],
+        'constant_pad_nd': ['f16', 'f32'],
         'contiguous': ['f16', 'f32'],
+        'copysign': ['f16', 'f32'],
         'corrcoef': ['f32'],
         'cos': ['f32'],
         'cosh': ['f32'],
-        'cumsum': ['f16', 'f32'],
+        'count_nonzero': ['f16', 'f32'],
+        'cov': ['f32'],
+        'cross': ['f32'],
+        'cummax': ['f32'],
+        'cummin': ['f32'],
+        'cumprod': ['f32'],
+        'cumsum': ['f32'],
+        'cumulative_trapezoid': ['f32'],
         'deg2rad': ['f16', 'f32'],
-        'diag': ['f32'],
+        'diag': ['f16', 'f32'],
         'diag_embed': ['f16', 'f32'],
-        'diagflat': ['f32'],
+        'diagflat': ['f16', 'f32'],
+        'diagonal': ['f16', 'f32'],
+        'diagonal_copy': ['f16', 'f32'],
         'diagonal_scatter': ['f16', 'f32'],
         'diff': ['f16', 'f32'],
-        'dist': ['f32'],
+        'digamma': ['f32'],
+        'dist': ['f16', 'f32'],
+        'div': ['f16', 'f32'],
         'dot': ['f32'],
+        'double': ['f16', 'f32'],
+        'dsplit': ['f16', 'f32'],
+        'dstack': ['f16', 'f32'],
         'einsum': ['f32'],
+        'empty_like': ['f16', 'f32'],
+        'eq': ['f16', 'f32'],
         'erf': ['f32'],
+        'erfc': ['f32'],
+        'erfinv': ['f32'],
         'exp': ['f32'],
         'exp2': ['f16', 'f32'],
+        'expand': ['f16', 'f32'],
+        'expand_as': ['f16', 'f32'],
+        'expm1': ['f32'],
+        'fft.fftshift': ['f16', 'f32'],
+        'fft.hfft': ['f32'],
+        'fft.hfft2': ['f32'],
+        'fft.hfftn': ['f32'],
+        'fft.ifftshift': ['f16', 'f32'],
+        'fft.irfft': ['f32'],
+        'fft.irfft2': ['f32'],
+        'fft.irfftn': ['f32'],
         'fill': ['f16', 'f32'],
         'flatten': ['f16', 'f32'],
         'flip': ['f16', 'f32'],
         'fliplr': ['f16', 'f32'],
         'flipud': ['f16', 'f32'],
-        'float': ['f32'],
+        'float': ['f16', 'f32'],
+        'float_power': ['f16', 'f32'],
         'floor': ['f32'],
-        'gradient': ['f32'],
-        'half': ['f16'],
+        'fmax': ['f16', 'f32'],
+        'fmin': ['f16', 'f32'],
+        'fmod': ['f16', 'f32'],
+        'frac': ['f16', 'f32'],
+        'frexp': ['f16', 'f32'],
+        'full': ['f16', 'f32'],
+        'full_like': ['f16', 'f32'],
+        'gather': ['f16', 'f32'],
+        'ge': ['f16', 'f32'],
+        'gradient': ['f16', 'f32'],
+        'grid_sampler_2d': ['f32'],
+        'gt': ['f16', 'f32'],
+        'half': ['f16', 'f32'],
+        'histc': ['f32'],
+        'hsplit': ['f16', 'f32'],
         'hstack': ['f16', 'f32'],
-        'index_select': ['f16', 'f32'],
+        'hypot': ['f32'],
+        'i0': ['f32'],
         'index_add': ['f16', 'f32'],
+        'index_copy': ['f16', 'f32'],
+        'index_fill': ['f16', 'f32'],
+        'index_put': ['f16', 'f32'],
+        'index_reduce': ['f16', 'f32'],
+        'index_select': ['f16', 'f32'],
+        'inner': ['f32'],
+        'int': ['f16', 'f32'],
         'isclose': ['f16', 'f32'],
         'isfinite': ['f16', 'f32'],
+        'isin': ['f32'],
         'isinf': ['f16', 'f32'],
         'isnan': ['f16', 'f32'],
+        'isneginf': ['f16', 'f32'],
+        'isposinf': ['f16', 'f32'],
         'isreal': ['f16', 'f32'],
-        'kron': ['f32'],
-        'linalg.matrix_norm': ['f16'],
+        'kron': ['f16', 'f32'],
+        'kthvalue': ['f32'],
+        'ldexp': ['f16', 'f32'],
+        'le': ['f16', 'f32'],
+        'lerp': ['f32'],
+        'lgamma': ['f32'],
+        'linalg.cholesky': ['f32'],
+        'linalg.cholesky_ex': ['f32'],
+        'linalg.cond': ['f32'],
+        'linalg.cross': ['f32'],
+        'linalg.det': ['f32'],
+        'linalg.eigh': ['f32'],
+        'linalg.eigvalsh': ['f32'],
+        'linalg.householder_product': ['f32'],
+        'linalg.inv': ['f32'],
+        'linalg.inv_ex': ['f32'],
+        'linalg.ldl_factor': ['f32'],
+        'linalg.ldl_factor_ex': ['f32'],
+        'linalg.lstsq': ['f32'],
+        'linalg.lu': ['f32'],
+        'linalg.lu_factor': ['f32'],
+        'linalg.lu_factor_ex': ['f32'],
+        'linalg.lu_solve': ['f32'],
+        'linalg.matrix_norm': ['f16', 'f32'],
+        'linalg.matrix_power': ['f32'],
+        'linalg.matrix_rank': ['f32'],
+        'linalg.multi_dot': ['f32'],
+        'linalg.norm': ['f16', 'f32'],
+        'linalg.pinv': ['f32'],
+        'linalg.qr': ['f32'],
+        'linalg.slogdet': ['f32'],
+        'linalg.solve': ['f32'],
+        'linalg.solve_ex': ['f32'],
+        'linalg.solve_triangular': ['f32'],
         'linalg.svd': ['f32'],
+        'linalg.svdvals': ['f32'],
+        'linalg.tensorinv': ['f32'],
+        'linalg.tensorsolve': ['f32'],
+        'linalg.vander': ['f32'],
+        'linalg.vecdot': ['f32'],
+        'linalg.vector_norm': ['f16', 'f32'],
         'linspace': ['f16', 'f32'],
         'log': ['f32'],
         'log10': ['f32'],
         'log1p': ['f32'],
         'log2': ['f32'],
-        'log_softmax': ['f32'],
+        'log_softmax': ['f32', 'f16'],
         'logaddexp': ['f32'],
+        'logaddexp2': ['f32'],
+        'logcumsumexp': ['f32'],
+        'logdet': ['f32'],
+        'logical_and': ['f16', 'f32'],
         'logical_not': ['f16', 'f32'],
+        'logical_or': ['f16', 'f32'],
+        'logical_xor': ['f16', 'f32'],
+        'logit': ['f32'],
         'logspace': ['f32'],
+        'logsumexp': ['f32'],
+        'long': ['f16', 'f32'],
+        'lt': ['f16', 'f32'],
+        'lu': ['f32'],
+        'lu_solve': ['f32'],
+        'lu_unpack': ['f32'],
+        'mH': ['f16', 'f32'],
+        'mT': ['f16', 'f32'],
+        'masked.amax': ['f16', 'f32'],
+        'masked.amin': ['f16', 'f32'],
+        'masked.argmax': ['f16', 'f32'],
+        'masked.argmin': ['f16', 'f32'],
+        'masked.cumprod': ['f32'],
+        'masked.cumsum': ['f32'],
+        'masked.log_softmax': ['f32'],
+        'masked.logaddexp': ['f32'],
+        'masked.logsumexp': ['f32'],
+        'masked.mean': ['f16', 'f32'],
+        'masked.median': ['f32'],
+        'masked.norm': ['f16', 'f32'],
+        'masked.normalize': ['f16', 'f32'],
+        'masked.prod': ['f32'],
+        'masked.softmax': ['f32'],
+        'masked.softmin': ['f32'],
+        'masked.std': ['f32'],
+        'masked.sum': ['f16', 'f32'],
+        'masked.var': ['f16', 'f32'],
+        'masked_fill': ['f16', 'f32'],
+        'masked_scatter': ['f16', 'f32'],
+        'masked_select': ['f16', 'f32'],
         'matmul': ['f32'],
+        'matrix_exp': ['f32'],
+        'max': ['f16', 'f32'],
+        'max_pool2d_with_indices_backward': ['f32'],
+        'maximum': ['f16', 'f32'],
+        'mean': ['f16', 'f32'],
+        'median': ['f32'],
+        'meshgrid': ['f16', 'f32'],
+        'min': ['f16', 'f32'],
+        'minimum': ['f16', 'f32'],
         'mm': ['f32'],
+        'mode': ['f16', 'f32'],
+        'movedim': ['f16', 'f32'],
+        'msort': ['f16', 'f32'],
+        'mul': ['f16', 'f32'],
+        'multinomial': ['f32'],
         'mv': ['f32'],
+        'mvlgamma': ['f32'],
+        'nan_to_num': ['f16', 'f32'],
+        'nanmean': ['f16', 'f32'],
+        'nanmedian': ['f32'],
+        'nanquantile': ['f32'],
+        'nansum': ['f16', 'f32'],
+        'narrow': ['f16', 'f32'],
+        'native_batch_norm': ['f32'],
+        'native_dropout_backward': ['f16', 'f32'],
+        'native_layer_norm': ['f32'],
+        'ne': ['f16', 'f32'],
         'neg': ['f16', 'f32'],
-        'nn.functional.adaptive_max_pool1d': ['f32'],
-        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'new_empty': ['f16', 'f32'],
+        'new_empty_strided': ['f16', 'f32'],
+        'new_full': ['f16', 'f32'],
+        'new_ones': ['f16', 'f32'],
+        'new_zeros': ['f16', 'f32'],
+        'nn.functional._scaled_dot_product_attention': ['f32'],
         'nn.functional.adaptive_avg_pool1d': ['f32'],
         'nn.functional.adaptive_avg_pool2d': ['f32'],
+        'nn.functional.adaptive_avg_pool3d': ['f16', 'f32'],
+        'nn.functional.adaptive_max_pool1d': ['f32'],
+        'nn.functional.adaptive_max_pool2d': ['f32'],
+        'nn.functional.adaptive_max_pool3d': ['f32'],
+        'nn.functional.alpha_dropout': ['f32'],
         'nn.functional.avg_pool1d': ['f32'],
         'nn.functional.avg_pool2d': ['f32'],
+        'nn.functional.avg_pool3d': ['f32'],
+        'nn.functional.batch_norm': ['f32'],
+        'nn.functional.bilinear': ['f32'],
         'nn.functional.binary_cross_entropy': ['f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f32'],
         'nn.functional.celu': ['f32'],
         'nn.functional.conv1d': ['f32'],
         'nn.functional.conv2d': ['f32'],
         'nn.functional.conv_transpose1d': ['f32'],
+        'nn.functional.conv_transpose2d': ['f32'],
+        'nn.functional.conv_transpose3d': ['f32'],
         'nn.functional.cosine_embedding_loss': ['f32'],
+        'nn.functional.cosine_similarity': ['f32'],
+        'nn.functional.cross_entropy': ['f32'],
+        'nn.functional.ctc_loss': ['f32'],
+        'nn.functional.dropout': ['f32'],
+        'nn.functional.dropout2d': ['f32'],
+        'nn.functional.dropout3d': ['f32'],
         'nn.functional.elu': ['f32'],
-        'nn.functional.feature_alpha_dropout': ['f16', 'f32'],
+        'nn.functional.embedding': ['f16', 'f32'],
+        'nn.functional.embedding_bag': ['f16', 'f32'],
+        'nn.functional.feature_alpha_dropout': ['f32', 'f16'],
+        'nn.functional.fractional_max_pool2d': ['f32'],
+        'nn.functional.fractional_max_pool3d': ['f32'],
+        'nn.functional.gaussian_nll_loss': ['f32'],
+        'nn.functional.gelu': ['f32'],
         'nn.functional.glu': ['f32'],
+        'nn.functional.grid_sample': ['f32'],
+        'nn.functional.group_norm': ['f32'],
+        'nn.functional.hardshrink': ['f32'],
+        'nn.functional.hardsigmoid': ['f32'],
+        'nn.functional.hardswish': ['f32'],
         'nn.functional.hardtanh': ['f32'],
         'nn.functional.hinge_embedding_loss': ['f32'],
         'nn.functional.huber_loss': ['f16', 'f32'],
         'nn.functional.instance_norm': ['f32'],
+        'nn.functional.interpolate': ['f32'],
         'nn.functional.kl_div': ['f32'],
         'nn.functional.l1_loss': ['f16', 'f32'],
+        'nn.functional.layer_norm': ['f32'],
         'nn.functional.leaky_relu': ['f32'],
+        'nn.functional.linear': ['f32'],
         'nn.functional.local_response_norm': ['f32'],
+        'nn.functional.logsigmoid': ['f32'],
         'nn.functional.margin_ranking_loss': ['f32'],
         'nn.functional.max_pool1d': ['f32'],
         'nn.functional.max_pool2d': ['f32'],
+        'nn.functional.max_pool3d': ['f32'],
+        'nn.functional.max_unpool1d': ['f32'],
+        'nn.functional.max_unpool2d': ['f32'],
+        'nn.functional.max_unpool3d': ['f32'],
+        'nn.functional.mish': ['f32'],
         'nn.functional.mse_loss': ['f32'],
+        'nn.functional.multi_margin_loss': ['f32'],
+        'nn.functional.multilabel_margin_loss': ['f32'],
+        'nn.functional.multilabel_soft_margin_loss': ['f32'],
         'nn.functional.nll_loss': ['f32'],
-        'nn.functional.pad': ['f16', 'f32', 'i16', 'i32', 'i64'],
+        'nn.functional.normalize': ['f32'],
+        'nn.functional.pad': ['f16', 'f32'],
         'nn.functional.pairwise_distance': ['f16', 'f32'],
+        'nn.functional.pdist': ['f32'],
+        'nn.functional.pixel_shuffle': ['f16', 'f32'],
+        'nn.functional.pixel_unshuffle': ['f16', 'f32'],
         'nn.functional.poisson_nll_loss': ['f32'],
+        'nn.functional.prelu': ['f32'],
         'nn.functional.relu': ['f32'],
         'nn.functional.relu6': ['f32'],
+        'nn.functional.rrelu': ['f32'],
         'nn.functional.selu': ['f32'],
         'nn.functional.silu': ['f32'],
+        'nn.functional.smooth_l1_loss': ['f32'],
         'nn.functional.soft_margin_loss': ['f32'],
-        'nn.functional.softmin': ['f32'],
+        'nn.functional.softmin': ['f32', 'f16'],
         'nn.functional.softplus': ['f32'],
+        'nn.functional.softshrink': ['f32'],
         'nn.functional.softsign': ['f16', 'f32'],
-        'nn.functional.smooth_l1_loss': ['f32'],
+        'nn.functional.tanhshrink': ['f32'],
         'nn.functional.threshold': ['f32'],
         'nn.functional.triplet_margin_loss': ['f32'],
         'nn.functional.triplet_margin_with_distance_loss': ['f32'],
+        'nn.functional.unfold': ['f16', 'f32'],
         'nn.functional.upsample_bilinear': ['f32'],
-        'norm': ['f32', 'f16'],
+        'nn.functional.upsample_nearest': ['f32'],
+        'nonzero': ['f16', 'f32'],
+        'norm': ['f16', 'f32'],
+        'normal': ['f16', 'f32'],
+        'ones': ['f16', 'f32'],
+        'ones_like': ['f16', 'f32'],
+        'ormqr': ['f32'],
+        'outer': ['f16', 'f32'],
+        'pca_lowrank': ['f32'],
+        'permute': ['f16', 'f32'],
+        'pinverse': ['f32'],
+        'polygamma': ['f32'],
         'positive': ['f16', 'f32'],
+        'pow': ['f32'],
+        'prod': ['f32'],
+        'put': ['f16', 'f32'],
+        'qr': ['f32'],
+        'quantile': ['f32'],
         'rad2deg': ['f16', 'f32'],
+        'rand_like': ['f16', 'f32'],
+        'randint': ['f16', 'f32'],
+        'randint_like': ['f16', 'f32'],
+        'randn_like': ['f16', 'f32'],
+        'ravel': ['f16', 'f32'],
         'real': ['f16', 'f32'],
         'reciprocal': ['f16', 'f32'],
+        'remainder': ['f16', 'f32'],
+        'renorm': ['f16', 'f32'],
         'repeat': ['f16', 'f32'],
         'repeat_interleave': ['f16', 'f32'],
+        'reshape': ['f16', 'f32'],
+        'reshape_as': ['f16', 'f32'],
         'resolve_conj': ['f16', 'f32'],
         'resolve_neg': ['f16', 'f32'],
+        'roll': ['f16', 'f32'],
+        'rot90': ['f16', 'f32'],
         'round': ['f32'],
         'rsqrt': ['f32'],
+        'rsub': ['f16', 'f32'],
+        'scatter': ['f16', 'f32'],
+        'scatter_add': ['f16', 'f32'],
+        'scatter_reduce': ['f16', 'f32'],
+        'searchsorted': ['f16', 'f32'],
+        'segment_reduce': ['f16', 'f32'],
+        'select': ['f16', 'f32'],
         'select_scatter': ['f16', 'f32'],
+        'sgn': ['f16', 'f32'],
+        'short': ['f16', 'f32'],
+        'sigmoid': ['f32'],
         'sign': ['f16', 'f32'],
+        'signbit': ['f16', 'f32'],
         'sin': ['f32'],
+        'sinc': ['f32'],
         'sinh': ['f32'],
+        'slice': ['f16', 'f32'],
         'slice_scatter': ['f16', 'f32'],
-        'softmax': ['f32'],
+        'softmax': ['f32', 'f16'],
+        'sort': ['f16', 'f32'],
+        'special.airy_ai': ['f32'],
+        'special.bessel_j0': ['f32'],
+        'special.bessel_j1': ['f32'],
+        'special.bessel_y0': ['f32'],
+        'special.bessel_y1': ['f32'],
+        'special.chebyshev_polynomial_t': ['f32'],
+        'special.chebyshev_polynomial_u': ['f32'],
+        'special.entr': ['f32'],
+        'special.erfcx': ['f32'],
+        'special.hermite_polynomial_h': ['f32'],
+        'special.hermite_polynomial_he': ['f32'],
+        'special.i0e': ['f32'],
+        'special.i1': ['f32'],
+        'special.i1e': ['f32'],
+        'special.laguerre_polynomial_l': ['f32'],
+        'special.log_ndtr': ['f32'],
+        'special.modified_bessel_i0': ['f32'],
+        'special.modified_bessel_i1': ['f32'],
+        'special.modified_bessel_k0': ['f32'],
+        'special.modified_bessel_k1': ['f32'],
+        'special.ndtr': ['f32'],
+        'special.ndtri': ['f32'],
+        'special.polygamma': ['f32'],
+        'special.scaled_modified_bessel_k0': ['f32'],
+        'special.scaled_modified_bessel_k1': ['f32'],
+        'special.spherical_bessel_j0': ['f32'],
+        'special.xlog1py': ['f16', 'f32'],
         'split': ['f16', 'f32'],
+        'split_with_sizes': ['f16', 'f32'],
         'sqrt': ['f32'],
         'square': ['f16', 'f32'],
         'squeeze': ['f16', 'f32'],
         'stack': ['f16', 'f32'],
-        'sub': ['f32'],
+        'std': ['f16', 'f32'],
+        'std_mean': ['f16', 'f32'],
+        'sub': ['f16', 'f32'],
+        'sum': ['f16', 'f32'],
         'sum_to_size': ['f16', 'f32'],
         'svd': ['f32'],
+        'svd_lowrank': ['f32'],
+        'symeig': ['f32'],
         't': ['f16', 'f32'],
+        'take': ['f16', 'f32'],
+        'take_along_dim': ['f16', 'f32'],
+        'tan': ['f32'],
         'tanh': ['f32'],
+        'tensor_split': ['f16', 'f32'],
         'tensordot': ['f32'],
         'tile': ['f16', 'f32'],
+        'to': ['f16', 'f32'],
+        'topk': ['f32'],
+        'trace': ['f32'],
+        'transpose': ['f16', 'f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
+        'triangular_solve': ['f32'],
         'tril': ['f16', 'f32'],
         'triu': ['f16', 'f32'],
         'true_divide': ['f16', 'f32'],
         'trunc': ['f32'],
         'unbind': ['f16', 'f32'],
         'unflatten': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'uniform': ['f16', 'f32'],
         'unsqueeze': ['f16', 'f32'],
+        'var': ['f16', 'f32'],
+        'var_mean': ['f16', 'f32'],
+        'vdot': ['f32'],
         'view': ['f16', 'f32'],
         'view_as': ['f16', 'f32'],
+        'view_copy': ['f16', 'f32'],
         'vsplit': ['f16', 'f32'],
         'vstack': ['f16', 'f32'],
+        'where': ['f16', 'f32'],
+        'xlogy': ['f16', 'f32'],
         'zero_': ['f16', 'f32'],
-        'linalg.solve_triangular': ['f32'],
-        'triangular_solve': ['f32'],
-        '_native_batch_norm_legit': ['f32'],
-        'native_batch_norm': ['f32'],
-        'native_layer_norm': ['f32'],
-        'nn.functional.gelu': ['f32'],
+        'zeros': ['f16', 'f32'],
+        'zeros_like': ['f16', 'f32'],
+    }
+
+    BLOCKLIST_OP_GRAD = {
+        # Unimplemented ops
+        '__getitem__': ['f16'],
+        'combinations': ['f16', 'f32'],
+        'logaddexp2': ['f32'],
+        'masked_select': ['f16', 'f32'],
+        'nn.functional.binary_cross_entropy_with_logits': ['f16', 'f32'],
+        'nn.functional.group_norm': ['f32'],
+        'prod': ['f32'],
+        'sgn': ['f16', 'f32'],
+        'unfold_copy': ['f16', 'f32'],
+        'unfold': ['f16', 'f32'],
+        'trace': ['f32'],
+
+        # Correctness issues
+        'nn.functional.prelu': ['f32'],
+        'atanh': ['f32'],
+        'div': ['f16'],
+        'nn.functional.bilinear': ['f32'],
+        'nn.functional.embedding': ['f16'],
+
+        # Unsupported dtype
+        'special.ndtr': ['f32'],
+        'trapezoid': ['f16', 'f32'],
+        'trapz': ['f16', 'f32'],
+    }
+
+    BLOCKLIST_OP_GRAD_MACOS_12 = {
+        'remainder': ['f16'],
     }
 
     # These ops that are problematic. So never run them even when
@@ -9567,145 +10060,22 @@ class TestConsistency(TestCaseMPS):
     # If the dtype list is None, all dtypes are excluded.
     # All the entries in this list should be removed
     BLOCKLIST = {
-<<<<<<< HEAD
-        # Functions that hang
-        'masked_fill': [torch.bool, torch.uint8, torch.float32], 'where': [torch.bool],
-        # + forward when requires_grad=True or running backward
-        'masked.mean': [torch.bool, torch.float16],
-        'masked.prod': [torch.bool],
-        'masked.sum': [torch.bool],
-
         # Functions that hard crash
-        'std': [torch.float16],
-        'stft': [torch.float32], 'var': [torch.float16],
-        # + forward when requires_grad=True or running backward
-        'nn.functional.embedding': [torch.float32, torch.float16],
-        '__rpow__': [torch.int64],
-
-        'as_strided_scatter': [torch.uint8],
-        'atan2': [torch.int64],
-        'bfloat16': None,
-        'block_diag': [torch.uint8],
-        'byte': None,
-        'chalf': None,
-        'diag_embed': [torch.uint8],
-        'diagonal_scatter': [torch.uint8],
-        'long': None,
-        'nn.functional.conv1d': [torch.int64],
-        'nn.functional.conv2d': [torch.int64],
-        'nn.functional.conv_transpose1d': [torch.int64],
-        'nn.functional.conv_transpose2d': [torch.int64],
-        'nn.functional.conv_transpose3d': [torch.int64, torch.float32],
-        'nn.functional.local_response_norm': [torch.int64],
-        'nn.functional.padcircular': [torch.uint8],
-        'pow': [torch.int64],
-        'select_scatter': [torch.uint8],
-        'sigmoid': [torch.int64],
-
-
-        # failures due to lack of op implementation on MPS backend
-        'put': ['torch.bool', 'torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-
-        # These were moved from ALLOWLIST to BLOCK as they are not working
-        # locally
-        'tile': ['torch.float16', 'torch.float32', 'torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        '__radd__': ['torch.bool', 'torch.uint8'],
-        '__rmul__': ['torch.uint8'],
-        'neg': ['torch.uint8'],
-        'add': ['torch.bool', 'torch.uint8'],
-        'addr': ['torch.int16', 'torch.int32', 'torch.int64', 'torch.uint8'],
-        'diag': ['torch.int64'],
-        'diagflat': ['torch.int64'],
-
-        # Functions that are flaky
-        # These are detected as "ok" by the expect case but actually fail to run sometimes
-        'as_strided': None,
-        'broadcast_tensors': None,
-        'broadcast': None,
-        'broadcast_to': None,
-        'diagonal': None,
-        'divfloor_rounding': None,
-        'divno_rounding_mode': None,
-        'divtrunc_rounding': None,
-        'dsplit': None,
-        'hsplit': None,
-        'empty': None,
-        'expand_as': None,
-        'expand': None,
-        'ge': None,
-        'ne': None,
-        'le': None,
-        'lt': None,
-        'gt': None,
-        'transpose': None,
-        'splitlist_args': None,
-        'select': None,
-        'reshape': None,
-        'reshape_as': None,
-        'permute': None,
-        'norm': None,
-        'nn.functional.pixel_unshuffle': None,
-        'nn.functional.pixel_shuffle': None,
-        'nn.functional.cross_entropy': None,
-        'nn.functional.one_hot': None,
-        'narrow': None,
-        'movedim': None,
-        'minreduction_with_dim': None,
-        'minreduction_no_dim': None,
-        'minbinary': None,
-        'meshgridvariadic_tensors': None,
-        'meshgridlist_of_tensors': None,
-        'maxreduction_with_dim': None,
-        'maxreduction_no_dim': None,
-        'maxbinary': None,
-        'maximum': None,
-        'minimum': None,
-        'outer': None,
-        'softmaxwith_dtype': None,
-        'rounddecimals_neg_3': None,
-        'rounddecimals_3': None,
-        'rounddecimals_0': None,
-        'normnuc': None,
-        'nn.functional.softminwith_dtype': None,
-        'nn.functional.feature_alpha_dropoutwith_train': None,
-        'log_softmaxwith_dtype': None,
-        'split_with_sizes': None,
-        'trapezoid': None,
-        'eq': None,
-        'mul': None,
-        'cartesian_prod': None,
-        'bool': None,
-        'inner': None,
-        'dstack': None,
-        'take_along_dim': None,
-=======
-        # Functions that hard crash
-        'nn.functional.softplus': [torch.float32],
-        'median': [torch.float32, torch.int16, torch.int32, torch.uint8, torch.int16],
-        'sgn': [torch.bool],
-        'linalg.inv': [torch.float32],
-        'linalg.inv_ex': [torch.float32],
         'linalg.matrix_power': [torch.float32],
-        'nn.functional.interpolate': [torch.float32],
         'resize_': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'nn.functional.interpolatearea': [torch.float32],
         'resize_as_': [torch.float16, torch.float32],
         'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
-        'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'divfloor_rounding': [torch.int16, torch.int32, torch.int64],
-        'divtrunc_rounding': [torch.float16],
-        'norm': [torch.float16],
         'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
-        'cumulative_trapezoid': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
-        'addr': [torch.float16],
-        'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'trace': [torch.int64],
         'normalnumber_mean': [torch.float16, torch.float32],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'multinomial': [torch.float32],
-        'floor_divide': [torch.int16, torch.int32, torch.int64],
+
+        # cpu result off, showing random values
+        'as_stridedpartial_views': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        # cpu result off, showing inf values
         'dist': [torch.float16],
 
         # failure due to issue: atan2() may generate NAN in output with
@@ -9715,12 +10085,12 @@ class TestConsistency(TestCaseMPS):
         'grid_sampler_2d': [torch.float32],
         'nn.functional.grid_sample': [torch.float32],
 
-        # failures due to issue #103039644: Wrong results from avgPooling2DWithSourceTensor()
-        # when both ceilMode and includeZeroPadToAverage are True
-        'nn.functional.avg_pool1d': [torch.float32, torch.int64],
-        'nn.functional.avg_pool2d': [torch.float32, torch.int64],
-        'nn.functional.adaptive_avg_pool1d': [torch.float32],
-        'nn.functional.adaptive_avg_pool2d': [torch.float32],
+        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.uint8],
+
+        # failures before macOS 13.3
+        'nn.functional.conv_transpose2d': [torch.float32],
     }
 
     UNIMPLEMENTED_OPS = {
@@ -9838,6 +10208,7 @@ class TestConsistency(TestCaseMPS):
         'nn.functional.fractional_max_pool3d': [torch.float32],
         'nn.functional.adaptive_avg_pool3d': [torch.float16, torch.float32],
         'nn.functional.adaptive_max_pool3d': [torch.float32],
+        'nn.functional.interpolatearea': [torch.float32],
         'nn.functional.interpolatebicubic': [torch.float32],
         'nn.functional.interpolatelinear': [torch.float32],
         'nn.functional.interpolatetrilinear': [torch.float32],
@@ -9847,7 +10218,6 @@ class TestConsistency(TestCaseMPS):
         'nn.functional.avg_pool3d': [torch.float32, torch.int64],
         'nn.functional.ctc_loss': [torch.float32],
         'nn.functional.embedding_bag': [torch.float16, torch.float32],
-        'nn.functional.max_pool2d': [torch.float32],
         'nn.functional.hardshrink': [torch.float32],
         'nn.functional.hardsigmoid': [torch.float32],
         'nn.functional.logsigmoid': [torch.float32],
@@ -9876,7 +10246,6 @@ class TestConsistency(TestCaseMPS):
         'polygammapolygamma_n_4': [torch.bool, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'qr': [torch.float32],
         'quantile': [torch.float32],
-        'remainder': [torch.bool, torch.int16, torch.int32, torch.int64, torch.uint8],
         'renorm': [torch.float16, torch.float32],
         'roll': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'rsub': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9950,6 +10319,7 @@ class TestConsistency(TestCaseMPS):
         'symeig': [torch.float32],
         'take': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'to_sparse': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'unique': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'var_mean': [torch.float16, torch.float32],
         'var_meanunbiased': [torch.float16, torch.float32],
         'vdot': [torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -9977,6 +10347,7 @@ class TestConsistency(TestCaseMPS):
         'addmmdecomposed': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'addbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'addmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        'addr': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'addmv': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'baddbmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'bmm': [torch.int16, torch.int32, torch.int64, torch.uint8],
@@ -10037,10 +10408,6 @@ class TestConsistency(TestCaseMPS):
         'tensordot': [torch.int16, torch.int32, torch.int64, torch.uint8],
         'zeros_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'bincount': [torch.int16, torch.int32, torch.int64, torch.uint8],
-
-        # failures due to issue #102048039: powerWithPrimaryTensor() with integer input may return wrong results
-        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
-        '__rpow__': [torch.int16, torch.int32],
     }
 
     UNDEFINED_BEHAVIOUR = {
@@ -10064,15 +10431,19 @@ class TestConsistency(TestCaseMPS):
         'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         # duplicate indices are used in the testcase - undefined behaviour
         'index_put': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
+        # problem 104760543, zero to negative integer powers are undefined
+        '__rpow__': [torch.int16, torch.int32, torch.int64],
     }
 
-    # Those ops worked on MacOS12, but broken on MacOS13
-    VENTURA_BLOCKLIST = {
-        'masked.softmax': [torch.float32],
+    FAST_MATH_PRECISION_ISSUES = {
+        # Failures due to precision issues
+        'tan': [torch.float32],
+        'pow': [torch.float32],
         'masked.softmin': [torch.float32],
+        'masked.softmax': [torch.float32],
         'masked.log_softmax': [torch.float32],
-        'dot': [torch.int64],
+        'cdist': [torch.float32],
+        '__rpow__': [torch.float32]
     }
 
     FP16_LOW_PRECISION_LIST = {
@@ -10082,28 +10453,63 @@ class TestConsistency(TestCaseMPS):
         'true_divide', 'kron',
         'gradient', 'var', 'std',
         'linalg.vector_norm',
-        'masked.sum', 'masked.std',
-        'masked.var',
+        'addr',
+
+        # for macOS 12
+        'masked.normalize', 'masked.sum',
+        'outer',
+        'sum_to_size',
     }
 
-<<<<<<< HEAD
-=======
-    dirname = os.path.dirname(__file__)
-    filename = os.path.join(dirname, "cuda_results.yaml")
-    with open(filename) as f:
-        data = yaml.safe_load(f)
-    CUDA_RESULT = dict()
-    for key, value in data.items():
-        CUDA_RESULT[key] = torch.as_tensor(value)
+    BLOCKLIST_MACOS_12 = {
+        '__rdiv__': [torch.float16],
+        'masked.var': [torch.float16],
+        'sum': [torch.float16],
+        'mul': [torch.float16],
+
+        # expected failures
+        'nn.functional.interpolatenearest': [torch.float32],
+        'nn.functional.upsample_nearest': [torch.float32],
+        'nn.functional.conv_transpose2d': [torch.float32]
+    }
+
+    ALLOWLIST_MACOS_13_3 = {
+        'pow': [torch.int16, torch.int32, torch.int64, torch.uint8],
+        '__rpow__': [torch.uint8],
+        'nn.functional.conv_transpose2d': [torch.float32],
+    }
 
     MPS_SKIP_LIST = reduce(lambda x, y: dict(x, **y), (
         FAST_MATH_PRECISION_ISSUES, BLOCKLIST, UNDEFINED_BEHAVIOUR, EXPECTED_FAILURES, UNIMPLEMENTED_OPS))
 
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
     # Used for accept mode only
     NEW_ALLOW_LIST = defaultdict(list)
     NEW_ALLOW_LIST_GRAD = defaultdict(list)
 
+    def get_error_message(self, key, op_name, dtype):
+        if key in self.FAST_MATH_PRECISION_ISSUES and dtype in self.FAST_MATH_PRECISION_ISSUES[key]:
+            return f"Running test with {op_name} fails due to precision issues (fast math) so skipping"
+        elif key in self.BLOCKLIST and dtype in self.BLOCKLIST[key]:
+            return f"Running test with {op_name} fails so skipping"
+        elif key in self.UNDEFINED_BEHAVIOUR and dtype in self.UNDEFINED_BEHAVIOUR[key]:
+            return f"Running test with {op_name} fails due to undefined behaviour / random output so skipping"
+        elif key in self.EXPECTED_FAILURES and dtype in self.EXPECTED_FAILURES[key]:
+            return f"Running test with {op_name} expected to fail due to unsupported MPS data type so skipping"
+        elif key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
+            return f"Running test with {op_name} expected to fail due to missing op implementation"
+        elif product_version < 13.0 and key in self.BLOCKLIST_MACOS_12 and dtype in self.BLOCKLIST_MACOS_12[key]:
+            return f"Running test with {op_name} expected to fail on macOS 12"
+        return None
+
+    def compare_with_CUDA(self, op, mps_out, atol, rtol):
+        cuda_out = CUDA_RESULT[op.name]
+        try:
+            self.assertEqual(cuda_out, mps_out, atol=atol, rtol=rtol)
+        except Exception as e:
+            return False
+        else:
+            return True
+
     @ops(op_db, allowed_dtypes=MPS_DTYPES)
     def test_output_match(self, device, dtype, op):
         self.assertEqual(device, "cpu")
@@ -10111,13 +10517,15 @@ def test_output_match(self, device, dtype, op):
             self.skipTest("MPS is not available")
 
         key = op.name + op.variant_test_name
-
-        if key in self.VENTURA_BLOCKLIST and torch.backends.mps.is_macos13_or_newer():
-            if dtype in self.VENTURA_BLOCKLIST[key]:
-                self.skipTest(f"{key}_{dtype} fails on Ventura, see https://github.com/pytorch/pytorch/issues/85758")
-        if key in self.BLOCKLIST:
-            if self.BLOCKLIST[key] is None or dtype in self.BLOCKLIST[key]:
-                self.skipTest(f"Running test with {op.name} hangs so skipping")
+        if key in self.MPS_SKIP_LIST:
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None and not (product_version >= 13.3 and
+                                        key in self.ALLOWLIST_MACOS_13_3 and dtype in self.ALLOWLIST_MACOS_13_3[key]):
+                self.skipTest(msg)
+        if product_version < 13.0 and key in self.BLOCKLIST_MACOS_12:
+            msg = self.get_error_message(key, op.name, dtype)
+            if msg is not None:
+                self.skipTest(msg)
 
         # Make this an expecttest manually
         # When this env variable is set, generate a new ALLOWLIST_OP
@@ -10135,7 +10543,10 @@ def test_output_match(self, device, dtype, op):
                 if dtype_abbrs[dtype] not in self.ALLOWLIST_OP[op.name]:
                     self.skipTest(f"{op.name} is in the allow list for MPS but {dtype} is excluded")
 
-            if op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name]:
+            if (op.name not in self.ALLOWLIST_OP_GRAD or dtype_abbrs[dtype] not in self.ALLOWLIST_OP_GRAD[op.name] or
+               (op.name in self.BLOCKLIST_OP_GRAD and dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD[op.name]) or
+               (product_version < 13.0 and op.name in self.BLOCKLIST_OP_GRAD_MACOS_12 and
+               dtype_abbrs[dtype] in self.BLOCKLIST_OP_GRAD_MACOS_12[op.name])):
                 run_grad_test = False
 
         def get_samples():
@@ -10167,7 +10578,7 @@ def get_samples():
                 cpu_out = op(*cpu_args, **cpu_kwargs)
                 mps_out = op(*mps_args, **mps_kwargs)
 
-                if op.name == "nn.functional.conv2d" and dtype == torch.float32:
+                if op.name == "nn.functional.conv2d" or op.name == "linalg.multi_dot" and dtype == torch.float32:
                     atol = 1e-4
                     rtol = 3e-5
                 elif (op.name in self.FP16_LOW_PRECISION_LIST) and dtype == torch.float16:
@@ -10179,6 +10590,11 @@ def get_samples():
                 elif (op.name == "native_layer_norm"):
                     atol = 1e-4
                     rtol = 1.3e-5
+                elif op.name == "norm" and dtype == torch.float16:
+                    atol = 7e-4
+                    rtol = 1.5e-3
+                elif op.name == "unique" and cpu_kwargs["sorted"] is False:
+                    continue
                 else:
                     atol = None
                     rtol = None
@@ -10186,16 +10602,11 @@ def get_samples():
                 self.assertEqual(cpu_out, mps_out, atol=atol, rtol=rtol)
 
             except Exception as e:
-<<<<<<< HEAD
                 if any(s in str(e).lower() for s in ["int64", "macos 13", "adaptive pool mps"]):
                     self.skipTest(f"Expected Runtime Error: {str(e)}")
-=======
-                if any(s in str(e).lower() for s in ["int64", "macos 13"]):
-                    self.skipTest(f"{str(e)}")
 
-                if op.name in self.CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
+                if op.name in CUDA_RESULT and self.compare_with_CUDA(op, mps_out, atol=atol, rtol=rtol):
                     continue
->>>>>>> a6b4bc54f83 (Enable MPS CI runners (#252))
 
                 if not generate_new_truth:
                     raise e
@@ -10274,6 +10685,12 @@ def req_grad(t):
 # Copied from `TestCommon` in `test_ops.py`, just enough to duplicate the `test_numpy_ref` for MPS
 @skipIfSlowGradcheckEnv
 class TestCommon(TestCase):
+
+    UNIMPLEMENTED_OPS = {
+        'aminmax': [torch.float32],
+        'roll': [torch.float32],
+    }
+
     exact_dtype = True
 
     # Verifies, on teardown, that no OpInfo is still using dynamic dtypes in CI
@@ -10304,6 +10721,10 @@ def tearDownClass(cls):
     # MPS only supports float32
     @ops(_ref_test_ops, allowed_dtypes=(torch.float32,))
     def test_numpy_ref_mps(self, device, dtype, op):
+        key = op.name + op.variant_test_name
+        if key in self.UNIMPLEMENTED_OPS and dtype in self.UNIMPLEMENTED_OPS[key]:
+            self.skipTest(f"Running test with {op.name} expected to fail due to missing op implementation")
+
         # Unlike `test_numpy_ref`, this test compares in `float32` since at the time of this test's creation MPS
         # does not support float64 Tensors.
         # A few ops are currently broken on their reference inputs, but not their sample inputs. These should

From 66951a045edc5cf11dd670759a9bfe041c7dc3f1 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 14 Feb 2023 18:26:29 -0500
Subject: [PATCH 05/29] Remove torch._six from test_mps (#326)

---
 test/test_mps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 2085d0cebe72..b7907e7ed199 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -19,7 +19,7 @@
 import yaml
 import platform
 from collections import defaultdict
-from torch._six import inf
+from torch import inf
 from torch.nn import Parameter
 from torch.testing._internal import opinfo
 from torch.testing._internal.common_utils import \

From 5ada241bcea2c5d732596e142cd902009e727ec4 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 14 Feb 2023 15:27:02 -0800
Subject: [PATCH 06/29] Remove unnecessary CI files (#327)

* Remove unnecessary CI files

* Additional files

* Update lint
---
 .github/auto_request_review.yml           |  29 --
 .github/workflows/auto_request_review.yml |  22 --
 .github/workflows/lint.yml                | 237 --------------
 .github/workflows/pull.yml                | 368 ----------------------
 .github/workflows/run_torchbench.yml      | 103 ------
 5 files changed, 759 deletions(-)
 delete mode 100644 .github/auto_request_review.yml
 delete mode 100644 .github/workflows/auto_request_review.yml
 delete mode 100644 .github/workflows/pull.yml
 delete mode 100644 .github/workflows/run_torchbench.yml

diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml
deleted file mode 100644
index 765fd1715e89..000000000000
--- a/.github/auto_request_review.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-# Documented at https://github.com/necojackarc/auto-request-review
-reviewers:
-  groups:
-    symbolic-shapes:
-      - ezyang
-      - Chillee
-      - albanD
-      - miladm
-      - bdhirsh
-      - voznesenskym
-      - jbschlosser
-
-  per_author:
-    symbolic-shapes:
-      - symbolic-shapes
-      - antoniojkim
-      - wconstab
-      - SherlockNoMad
-
-files:
-  # none yet, TODO: migrate CODEOWNERS here
-
-options:
-  ignore_draft: true
-  ignored_keywords:
-    - DO NOT REVIEW
-  # Just manually setup a self-referential per_author rule if you
-  # want group assignment
-  enable_group_assignment: false
diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml
deleted file mode 100644
index 7c98c2990fba..000000000000
--- a/.github/workflows/auto_request_review.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: Auto Request Review
-
-on:
-  pull_request:
-    types: [opened, ready_for_review, reopened]
-
-jobs:
-  auto-request-review:
-    # Don't run on forked repos
-    if: ${{ !github.event.pull_request.head.repo.fork }}
-    name: Auto Request Review
-    runs-on: ubuntu-latest
-    steps:
-      - name: Request review based on files changes and/or groups the author belongs to
-        # v0.7.0
-        uses: necojackarc/auto-request-review@e08cdffa277d50854744de3f76230260e61c67f4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 98a941d48b83..0b846bc5a90f 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -10,244 +10,7 @@ on:
 # The names of steps that actually test the code should be suffixed with `(nonretryable)`.
 # When any other step fails, it's job will be retried once by retryBot.
 jobs:
-  docker-image:
-    name: docker-image
-    uses: ./.github/workflows/_calculate-docker-image.yml
-    with:
-      docker-image-name: pytorch-linux-focal-linter
-
   lintrunner:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        CACHE_DIRECTORY="/tmp/.lintbin"
-        # Try to recover the cached binaries
-        if [[ -d "${CACHE_DIRECTORY}" ]]; then
-          # It's ok to fail this as lintrunner init would download these binaries
-          # again if they do not exist
-          cp -r "${CACHE_DIRECTORY}" . || true
-        fi
-
-        # This has already been cached in the docker image
-        lintrunner init 2> /dev/null
-
-        # Do build steps necessary for linters
-        python3 -m tools.linter.clang_tidy.generate_build_files
-        python3 -m tools.generate_torch_version --is_debug=false
-        python3 -m tools.pyi.gen_pyi \
-          --native-functions-path aten/src/ATen/native/native_functions.yaml \
-          --tags-path aten/src/ATen/native/tags.yaml \
-          --deprecated-functions-path "tools/autograd/deprecated.yaml"
-
-        RC=0
-        # Run lintrunner on all files
-        if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then
-          echo ""
-          echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
-          echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
-          RC=1
-        fi
-
-        # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
-        jq --raw-output \
-          '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
-          lint.json || true
-
-        exit $RC
-
-  quick-checks:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Ensure no non-breaking spaces
-        # NB: We use 'printf' below rather than '\u000a' since bash pre-4.2
-        # does not support the '\u000a' syntax (which is relevant for local linters)
-        (! git --no-pager grep -In "$(printf '\xC2\xA0')" -- . || (echo "The above lines have non-breaking spaces (U+00A0); please convert them to spaces (U+0020)"; false))
-
-        # Ensure cross-OS compatible file names
-        (! git ls-files | grep -E '([<>:"|?*]|[ .]$)' || (echo "The above file names are not valid across all operating systems. Please ensure they don't contain the characters '<>:""|?*' and don't end with a white space or a '.' "; false))
-
-        # Ensure no versionless Python shebangs
-        (! git --no-pager grep -In '#!.*python$' -- . || (echo "The above lines have versionless Python shebangs; please specify either python2 or python3"; false))
-
-        # Ensure ciflow tags mentioned in config
-        python3 .github/scripts/collect_ciflow_labels.py --validate-tags
-
-        # C++ docs check
-        pushd docs/cpp/source
-        ./check-doxygen.sh
-        popd
-
-        # CUDA kernel launch check
-        set -eux
-        python3 torch/testing/_internal/check_kernel_launches.py |& tee cuda_kernel_launch_checks.txt
-
-  pr-sanity-checks:
-    name: pr-sanity-checks
-    runs-on: [self-hosted, linux.large]
-    # Only run this on pull requests. This check is simple enough to be done without a Docker image
-    if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks')
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: -1
-
-      - name: PR size check (nonretryable)
-        env:
-          BASE: ${{ github.event.pull_request.base.sha }}
-          HEAD: ${{ github.event.pull_request.head.sha }}
-        run: |
-          bash .github/scripts/pr-sanity-check.sh
-
-  workflow-checks:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Regenerate workflows
-        .github/scripts/generate_ci_workflows.py
-
-        RC=0
-        # Assert that regenerating the workflows didn't change them
-        if ! .github/scripts/report_git_status.sh .github/workflows; then
-          echo
-          echo 'As shown by the above diff, the committed .github/workflows'
-          echo 'are not up to date according to .github/templates.'
-          echo 'Please run this command, commit, and push again to your PR:'
-          echo
-          echo '    .github/scripts/generate_ci_workflows.py'
-          echo
-          echo 'If running that command does nothing, you may need to rebase'
-          echo 'onto a more recent commit from the PyTorch master branch.'
-          RC=1
-        fi
-
-        # Check that jobs will be cancelled
-        .github/scripts/ensure_actions_will_cancel.py
-
-        exit $RC
-
-  toc:
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Regenerate ToCs and check that they didn't change
-        set -eu
-
-        export PATH=~/.npm-global/bin:"$PATH"
-        for FILE in $(git grep -Il '<!-- toc -->' -- '**.md'); do
-          markdown-toc --bullets='-' -i "$FILE"
-        done
-
-        if ! .github/scripts/report_git_status.sh .; then
-          echo
-          echo 'As shown by the above diff, the table of contents in one or'
-          echo 'more Markdown files is not up to date with the file contents.'
-          echo 'You can either apply that Git diff directly to correct the'
-          echo 'table of contents, or if you have npm installed, you can'
-          echo 'install the npm package markdown-toc and run the following'
-          # shellcheck disable=SC2016
-          echo 'command (replacing $FILE with the filename for which you want'
-          echo 'to regenerate the table of contents):'
-          echo
-          # shellcheck disable=SC2016
-          echo "    markdown-toc --bullets='-' -i \"\$FILE\""
-          false
-        fi
-
-  test-tools:
-    name: Test tools
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    needs: docker-image
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
-    with:
-      runner: linux.2xlarge
-      docker-image: ${{ needs.docker-image.outputs.docker-image }}
-      fetch-depth: 0
-      script: |
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Test tools
-        python3 -m unittest discover -vs tools/test -p 'test_*.py'
-        python3 -m unittest discover -vs .github/scripts -p 'test_*.py'
-
-  test_collect_env:
-    if: ${{ github.repository == 'pytorch/pytorch' }}
-    name: Test collect_env
-    runs-on: linux.20_04.4x
-    strategy:
-      matrix:
-        test_type: [with_torch, without_torch, older_python_version]
-    steps:
-      # [see note: pytorch repo ref]
-      # deep clone (fetch-depth 0) required, to allow us to use git log
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
-        with:
-          submodules: false
-          fetch-depth: 1
-      - name: Setup Python 3.5
-        if: matrix.test_type == 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.5'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Setup Python 3.8
-        if: matrix.test_type != 'older_python_version'
-        uses: actions/setup-python@v4
-        with:
-          python-version: '3.8'
-          architecture: x64
-          check-latest: false
-          cache: pip
-          cache-dependency-path: |
-            **/requirements.txt
-      - name: Install torch
-        if: matrix.test_type == 'with_torch'
-        run: |
-          pip install -r requirements.txt
-          # Doesn't really matter what torch version, we just need ANY torch installed
-          pip install 'torch==1.*'
-      - name: Run collect_env.py (nonretryable)
-        run: |
-          # All we need to see is that it passes
-          python3 torch/utils/collect_env.py
-
     runs-on: macos-m1-12
     steps:
       - name: Checkout PyTorch
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
deleted file mode 100644
index 2c5493639e4e..000000000000
--- a/.github/workflows/pull.yml
+++ /dev/null
@@ -1,368 +0,0 @@
-name: pull
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-      - main
-      - release/*
-      - landchecks/*
-  workflow_dispatch:
-  schedule:
-    - cron: 29 8 * * *  # about 1:29am PDT
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-jobs:
-  linux-focal-py3_8-gcc7-build:
-    name: linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "docs_test", shard: 1, num_shards: 1,  runner: "linux.2xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_8-gcc7-test:
-    name: linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
-
-  linux-docs:
-    name: linux-docs
-    uses: ./.github/workflows/_docs.yml
-    needs: linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-
-  linux-focal-py3_8-gcc7-no-ops:
-    name: linux-focal-py3.8-gcc7-no-ops
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-no-ops
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-
-  linux-focal-py3_8-gcc7-pch:
-    name: linux-focal-py3.8-gcc7-pch
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-pch
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-
-  linux-focal-py3_9-clang7-asan-build:
-    name: linux-focal-py3.9-clang7-asan
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.9-clang7-asan
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_9-clang7-asan-test:
-    name: linux-focal-py3.9-clang7-asan
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_9-clang7-asan-build
-    with:
-      build-environment: linux-focal-py3.9-clang7-asan
-      docker-image: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_9-clang7-asan-build.outputs.test-matrix }}
-
-  linux-focal-py3_8-clang10-onnx-build:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image-name: pytorch-linux-focal-py3-clang10-onnx
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-        ]}
-
-  linux-focal-py3_8-clang10-onnx-test:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-py3_8-clang10-onnx-build
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }}
-
-  linux-bionic-py3_8-clang9-build:
-    name: linux-bionic-py3.8-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3.8-clang9
-      docker-image-name: pytorch-linux-bionic-py3.8-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_8-clang9-test:
-    name: linux-bionic-py3.8-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_8-clang9-build
-    with:
-      build-environment: linux-bionic-py3.8-clang9
-      docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }}
-
-  linux-bionic-py3_11-clang9-build:
-    name: linux-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3.11-clang9
-      docker-image-name: pytorch-linux-bionic-py3.11-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-bionic-py3_11-clang9-test:
-    name: linux-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_11-clang9-build
-    with:
-      build-environment: linux-bionic-py3.11-clang9
-      docker-image: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_11-clang9-build.outputs.test-matrix }}
-
-  linux-vulkan-bionic-py3_11-clang9-build:
-    name: linux-vulkan-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-vulkan-bionic-py3.11-clang9
-      docker-image-name: pytorch-linux-bionic-py3.11-clang9
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-        ]}
-
-  linux-vulkan-bionic-py3_11-clang9-test:
-    name: linux-vulkan-bionic-py3.11-clang9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-vulkan-bionic-py3_11-clang9-build
-    with:
-      build-environment: linux-vulkan-bionic-py3.11-clang9
-      docker-image: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-vulkan-bionic-py3_11-clang9-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-build.outputs.test-matrix }}
-
-  linux-focal-py3-clang7-mobile-build:
-    name: linux-focal-py3-clang7-mobile-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-build
-      docker-image-name: pytorch-linux-focal-py3-clang7-asan
-      build-generates-artifacts: false
-
-  linux-jammy-cuda-11_7-cudnn8-py3_8-clang12-build:
-    name: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-      docker-image-name: pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12
-
-  linux-focal-py3-clang7-mobile-custom-build-static:
-    name: linux-focal-py3-clang7-mobile-custom-build-static
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3-clang7-mobile-custom-build-static
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-      build-generates-artifacts: false
-
-  linux-bionic-py3_8-clang8-xla-build:
-    name: linux-bionic-py3_8-clang8-xla
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-py3_8-clang8-xla
-      docker-image-name: xla_base
-      test-matrix: |
-        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
-        ]}
-
-  linux-bionic-py3_8-clang8-xla-test:
-    name: linux-bionic-py3_8-clang8-xla
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-py3_8-clang8-xla-build
-    with:
-      build-environment: linux-bionic-py3_8-clang8-xla
-      docker-image: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-py3_8-clang8-xla-build.outputs.test-matrix }}
-
-  win-vs2019-cpu-py3-build:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "windows.4xlarge" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cpu-py3-test:
-    name: win-vs2019-cpu-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cpu-py3-build
-    with:
-      build-environment: win-vs2019-cpu-py3
-      cuda-version: cpu
-      test-matrix: ${{ needs.win-vs2019-cpu-py3-build.outputs.test-matrix }}
-
-  win-vs2019-cuda11_7-py3-build:
-    if: github.event_name == 'pull_request'
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      sync-tag: win-cuda-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-bazel-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
-    uses: ./.github/workflows/_bazel-build-test.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-bazel-test
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit:
-    name: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-    uses: ./.github/workflows/_android-build-test.yml
-    with:
-      build-environment: linux-focal-py3-clang7-android-ndk-r19c-gradle-custom-build-single-full-jit
-      docker-image-name: pytorch-linux-focal-py3-clang7-android-ndk-r19c
-
-  linux-focal-py3_8-gcc7-mobile-lightweight-dispatch-build:
-    name: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-py3.8-gcc7-mobile-lightweight-dispatch-build
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      build-generates-artifacts: false
-
-  linux-focal-rocm5_4_2-py3_8-build:
-    # don't run build twice on master
-    if: github.event_name == 'pull_request'
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image-name: pytorch-linux-focal-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-sm86-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      cuda-arch-list: 8.6
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "functorch", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-sm86-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-sm86-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-sm86-build.outputs.test-matrix }}
diff --git a/.github/workflows/run_torchbench.yml b/.github/workflows/run_torchbench.yml
deleted file mode 100644
index 8d55f6a9479c..000000000000
--- a/.github/workflows/run_torchbench.yml
+++ /dev/null
@@ -1,103 +0,0 @@
-name: TorchBench CI (pytorch-linux-py3.8-cu116)
-on:
-  pull_request:
-
-env:
-  PYTHON_VERSION: "3.8"
-  # must be consistent with https://github.com/pytorch/benchmark/blob/main/requirements.txt#L19
-  NUMPY_VERSION: "1.21.2"
-  SETUP_SCRIPT: "/data/nvme/bin/setup_instance.sh"
-  PR_NUM: ${{ github.event.number }}
-  PR_BODY: ${{ github.event.pull_request.body }}
-  PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
-  PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
-jobs:
-  run-torchbench:
-    # We don't accept running on non-pytorch repos because of security concerns
-    # Only run the job when the body contains magic word "RUN_TORCHBENCH:"
-    if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.body, 'RUN_TORCHBENCH:') }}
-    runs-on: [self-hosted, bm-runner]
-    # Set to 12 hours
-    timeout-minutes: 720
-    steps:
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          path: pytorch
-      - name: Update self-hosted PyTorch
-        run: |
-          pushd "${HOME}"/pytorch
-          git remote prune origin
-          git fetch
-          popd
-      - name: Create conda environment and install deps
-        run: |
-          conda create -y -n pr-ci python="${PYTHON_VERSION}"
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          conda install -y numpy="${NUMPY_VERSION}" requests ninja pyyaml mkl mkl-include \
-                           setuptools cmake=3.22.* typing-extensions boto3 \
-                           pillow pytest tabulate gitpython git-lfs tqdm psutil
-          pip install --pre torch torchvision torchtext -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
-      - name: Setup TorchBench branch
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          python pytorch/.github/scripts/run_torchbench.py --pr-body "${PR_BODY_FILE}" set-torchbench-branch
-      - name: Checkout TorchBench
-        uses: malfet/checkout@silent-checkout
-        with:
-          repository: pytorch/benchmark
-          path: benchmark
-          lfs: false
-          ref: ${{ env.TORCHBENCH_BRANCH }}
-      - name: GPU Info
-        run: |
-          nvidia-smi
-      - name: Run TorchBench
-        run: |
-          set -x
-          pushd "${HOME}"/pytorch
-          PR_MERGE_BASE=$(git merge-base "$PR_BASE_SHA" "$PR_HEAD_SHA")
-          popd
-          PR_BODY_FILE=/tmp/pr-body.txt
-          echo "$PR_BODY" > ${PR_BODY_FILE}
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  --pr-body "$PR_BODY_FILE" \
-                  run \
-                  --pytorch-path "${HOME}"/pytorch \
-                  --torchbench-path "${PWD}"/benchmark \
-                  --pr-num "$PR_NUM" \
-                  --pr-base-sha "$PR_MERGE_BASE" \
-                  --pr-head-sha "$PR_HEAD_SHA"
-      - name: Upload result to S3
-        run: |
-          # shellcheck source=/dev/null
-          . "${SETUP_SCRIPT}"
-          conda activate pr-ci
-          python3 pytorch/.github/scripts/run_torchbench.py \
-                  upload-s3 \
-                  --result-dir "${HOME}/.torchbench/bisection/pr${{ github.event.number }}"
-      - name: Remove conda environment and cleanup
-        run: |
-          conda env remove --name pr-ci
-          rm /tmp/pr-body.txt
-      - name: Upload artifact
-        uses: actions/upload-artifact@v3
-        with:
-          name: TorchBench result
-          path: ~/.torchbench/bisection/pr${{ github.event.number }}
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true

From bf8eba99a4f0435aaee9b4abd3123fbeef94d4a2 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 14 Feb 2023 15:28:17 -0800
Subject: [PATCH 07/29] Enable test modules on MPS and CI runners (#305) (#324)

* Enable test modules on MPS and CI runners

* Update lint.yml

* Update comments

* Retrigger CI

* Retrigger CI #2

* Remove comment
---
 .github/workflows/_mac-test-mps.yml | 14 +++++
 .github/workflows/lint.yml          |  2 +-
 test/test_modules.py                | 81 ++++++++++++++++++++++++++---
 3 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index 1fcafb6db66f..f9c402a772ac 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -83,6 +83,20 @@ jobs:
           set -ex
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
+      - name: Run MPS Test Modules
+        id: test_2
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+        shell: arch -arch arm64 bash {0}
+        # During bring up of test_modules don't show this as an error.
+        continue-on-error: true
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          # TODO(https://github.com/pytorch/pytorch/issues/79293)
+
+          ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose
+
       - name: Print remaining test logs
         shell: bash
         if: always()
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 0b846bc5a90f..58566ebc3746 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -70,7 +70,7 @@ jobs:
           # shellcheck disable=SC1090
           set -ex
           set +e
-          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py; then
+          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then
               echo ""
               echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
               echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
diff --git a/test/test_modules.py b/test/test_modules.py
index 2ae17f5f8cf8..9c244fb65e60 100644
--- a/test/test_modules.py
+++ b/test/test_modules.py
@@ -10,12 +10,23 @@
 from torch.testing._internal.common_cuda import with_tf32_off
 from torch.testing._internal.common_device_type import (
     instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta)
+from torch.testing._internal.common_dtype import get_all_dtypes
 from torch.testing._internal.common_modules import module_db, modules, TrainEvalMode
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck,
-    gradgradcheck, skipIfMps, skipIfTorchInductor)
+    gradgradcheck, skipIfTorchInductor)
 from unittest.mock import patch, call
 
+MPS_DTYPES = get_all_dtypes()
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+    del MPS_DTYPES[MPS_DTYPES.index(t)]
+
+def _get_mps_error_msg(device, dtype, op, mps_blocklist):
+    if torch.backends.mps.is_available() and device == "mps" and dtype not in MPS_DTYPES:
+        return f"MPS doesn't support {str(dtype)} datatype"
+    if op.name.startswith(tuple(mps_blocklist)):
+        return "MPS doesn't support op " + str(op.name)
+    return None
 
 class TestModule(TestCase):
     _do_cuda_memory_leak_check = True
@@ -33,7 +44,8 @@ def _assert_module_parameters_and_buffer_are(self, module, device, dtype):
         def _check_module(items, name, device=device, dtype=dtype):
             for item_name, item in items:
                 self.assertEqual(
-                    item.device, device,
+                    # workaround for the tests checking the device (mps:0 with mps)
+                    item.device.type, device.type,
                     f'{name} {item_name} is on device {item.device} instead of the expected device {device}')
                 if item.dtype.is_floating_point:
                     self.assertEqual(
@@ -42,9 +54,16 @@ def _check_module(items, name, device=device, dtype=dtype):
         _check_module(module.named_parameters(), "Parameter")
         _check_module(module.named_buffers(), "Buffer")
 
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db)
     def test_forward(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -84,6 +103,10 @@ def test_forward(self, device, dtype, module_info, training):
     # They should be applied to any created parameters and buffers.
     @modules(module_db)
     def test_factory_kwargs(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -198,6 +221,11 @@ def _to_device1(objs):
     @modules(module_db)
     def test_repr(self, device, dtype, module_info, training):
         # Test module can be represented with repr and str without errors.
+
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -211,10 +239,19 @@ def test_repr(self, device, dtype, module_info, training):
             m.__repr__()
             str(m)
 
-    @skipIfMps
     @modules(module_db)
     def test_pickle(self, device, dtype, module_info, training):
         # Test that module can be pickled and unpickled.
+
+        MPS_BLOCKLIST = [
+            "nn.LSTM"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)
@@ -249,6 +286,15 @@ def test_pickle(self, device, dtype, module_info, training):
     def test_check_inplace(self, device, dtype, module_info, training):
         # Check if the inplace variant of the module gives the same result as the out of place
         # variant.
+
+        MPS_BLOCKLIST = [
+            "nn.ELU"  # hard crash
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=True, training=training)
@@ -326,11 +372,21 @@ def inner_zero_grad(obj):
                 obj.grad = None
         self._traverse_obj(obj, inner_zero_grad)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_non_contiguous_tensors(self, device, dtype, module_info, training):
         # Check modules work with non-contiguous tensors
+        MPS_BLOCKLIST = [
+            # hard crashes
+            "nn.GRU",
+            "nn.LSTM",
+            "nn.RNN"
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
 
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
@@ -582,10 +638,18 @@ def check_backward(cpu_output, gpu_output):
                     for cpu_output, gpu_output in zip(flatten_cpu_outputs, flatten_gpu_outputs):
                         check_backward(cpu_output, gpu_output)
 
-    @skipIfMps
     @modules(module_db)
     @skipIfTorchInductor("to be fixed")
     def test_memory_format(self, device, dtype, module_info, training):
+        MPS_BLOCKLIST = [
+            "nn.BatchNorm3d",  # failed assert
+            "nn.LSTM",  # segfault
+        ]
+
+        msg = _get_mps_error_msg(device, dtype, module_info, MPS_BLOCKLIST)
+        if msg is not None:
+            self.skipTest(msg)
+
         is_sm86 = device.startswith("cuda") and torch.cuda.get_device_capability(0) == (8, 6)
         # TODO tighten it to a specific module
         atol, rtol = (3e-3, 7e-3) if is_sm86 else (None, None)
@@ -682,9 +746,12 @@ def inner_check_out_mem_format(output):
 
     # Test whether train and eval modes differ for each module. Use to verify
     # that the ModuleInfo entry flag is correct.
-    @skipIfMps  # the test doesn't work on MPS as double types are not supported
     @modules(module_db, train_eval_mode=TrainEvalMode.train_only)
     def test_if_train_and_eval_modes_differ(self, device, dtype, module_info, training):
+        msg = _get_mps_error_msg(device, dtype, module_info, [])
+        if msg is not None:
+            self.skipTest(msg)
+
         module_cls = module_info.module_cls
         module_inputs = module_info.module_inputs_func(module_info, device=device, dtype=dtype,
                                                        requires_grad=False, training=training)

From 2f336a4ee7bcc70a5d6495dcc5cb659339d2f4a9 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 14 Feb 2023 15:30:28 -0800
Subject: [PATCH 08/29] [CHERRY-PICK] Block uint8 data type for unary and
 binary ops on macOS 12. (#313) (#328)

* Block uint8 data type for unary and binary ops on macOS 12. (#313)

* fixes after cherry-pick

---------

Co-authored-by: Ronian526 <11454459+Ronian526@users.noreply.github.com>
---
 aten/src/ATen/native/mps/operations/BinaryOps.mm | 2 ++
 aten/src/ATen/native/mps/operations/UnaryOps.mm  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm
index c730eccfe944..6569e59086fc 100644
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@@ -26,6 +26,8 @@
 void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha,
                     const Tensor& output_, std::string op_name, BinaryOpBlock binaryBlock)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support binary op with uint8 natively starting from macOS 13.0");
   TORCH_CHECK(!(op_name == "power" && !is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_2_PLUS) &&
               (self.scalar_type() == ScalarType::Long ||
               (other.scalar_type() == ScalarType::Long && (self.scalar_type() != ScalarType::Half && self.scalar_type() != ScalarType::Float)))),
diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm
index a869ff3379aa..0c6e5b06d089 100644
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@@ -16,6 +16,8 @@ bool is_empty_tensor(const Tensor& self) {
 
 void unary_op(const Tensor& self, const Tensor& output, std::string op_name, UnaryOpBlock unaryBlock, is_noop_p is_noop = is_empty_tensor)
 {
+  TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte ),
+              "MPS support unary op with uint8 natively starting from macOS 13.0");
   if (!output.is_same_size(self)) {
     output.resize_(self.sizes());
   }

From 108cdc015b11b98f735cc705626fc8ae3c9291b7 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 14 Feb 2023 20:38:59 -0500
Subject: [PATCH 09/29] Fix test_zero_grad() (#330)

---
 test/test_mps.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index b7907e7ed199..a1fc4d7dd5f7 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -6255,24 +6255,24 @@ def test_zero_grad(self):
         self.assertIsNotNone(module.weight.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         module.zero_grad()
-        self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
+        self.assertIsNone(module.weight.grad)
 
         module.bias.requires_grad = True
         module.zero_grad()
-        self.assertIsNotNone(module.weight.grad)
+        self.assertIsNone(module.weight.grad)
         self.assertIsNone(module.bias.grad)
         module(i).sum().backward()
         self.assertIsNotNone(module.weight.grad)
         self.assertIsNotNone(module.bias.grad)
         self.assertGreater(module.weight.grad.data.abs().sum(), 0)
         self.assertGreater(module.bias.grad.data.abs().sum(), 0)
-        module.zero_grad()
+        module.zero_grad(set_to_none=False)   # Force set to zeros.
         self.assertEqual(module.weight.grad.data, module.weight.data.clone().zero_())
         self.assertEqual(module.bias.grad.data, module.bias.data.clone().zero_())
 
-        # Force set to None.
-        module.zero_grad(set_to_none=True)
+        module.zero_grad()
         self.assertIsNone(module.weight.grad)
+        self.assertIsNone(module.bias.grad)
 
     def test_no_grad(self):
         for dtype in [torch.bfloat16, torch.float, torch.double]:

From 8de331505dde414f3a882b0b7feb75056b79368f Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Tue, 14 Feb 2023 22:05:23 -0500
Subject: [PATCH 10/29] Convert output back to ChannelsLast if needed (#325)

---
 aten/src/ATen/native/mps/operations/Pooling.mm | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
index 2b9272d46759..08727fed8265 100644
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -83,6 +83,7 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
   pool2d_shape_check(input, kH, kW, dH, dW, padH, padW, dilationH, dilationW,
                      nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
 
+  auto output_memory_format = output.suggest_memory_format();
   // the output and indices are 'empty', so we could avoid unnecessary gatherView on empty tensors
   // by simply restriding them (instead of calling the costly Contiguous()).
   if (indices.suggest_memory_format() == MemoryFormat::ChannelsLast) {
@@ -94,8 +95,9 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
       outputSizes.insert(outputSizes.begin(), nbatch);
     }
     output.resize_(outputSizes);
-  } else if (output.suggest_memory_format() == MemoryFormat::ChannelsLast) {
+  } else if (output_memory_format == MemoryFormat::ChannelsLast) {
     output.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
+    output_memory_format = MemoryFormat::Contiguous;
   }
 
   if (output.numel() == 0 || (is_backward_pass && grad_output.numel() == 0)) {
@@ -196,6 +198,10 @@ static void pool2d_template(const Tensor& input, const Tensor& output,
     }
 
     runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results);
+
+    if (output_memory_format != suggested_memory_format) {
+      const_cast<Tensor&>(output) = output.to(suggested_memory_format);
+    }
   }
 }
 
@@ -356,6 +362,8 @@ Tensor mps_max_pool2d_backward(
     const Tensor& output,
     const Tensor& indices) {
 
+  auto indices_memory_format = indices.suggest_memory_format();
+
   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
     MPSGraph* mpsGraph = cachedGraph.graph();
     NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor: cachedGraph.inputTensor
@@ -366,6 +374,10 @@ Tensor mps_max_pool2d_backward(
   };
   mps::pool2d_template(input, output, indices, c10::nullopt, kernel_size, stride,
                        padding, dilation, ceil_mode, false, c10::nullopt, pooling_op_block, "max_pool2d_indices");
+
+  if (indices_memory_format == MemoryFormat::ChannelsLast) {
+    const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
+  }
 }
 
 TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)(

From 051bc9c4c759d119ce76a9cd3b19b1ea5edee418 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 14 Feb 2023 22:44:43 -0800
Subject: [PATCH 11/29] Fix bilinear backward pass (#331)

* Fix bilinear backward pass

* Remove comment
---
 aten/src/ATen/native/mps/OperationUtils.mm       | 2 +-
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 3 +++
 test/test_mps.py                                 | 1 -
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm
index 978162aed855..4e76c172fb6e 100644
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@@ -265,7 +265,7 @@ void printTensorNDArray(const Tensor& t) {
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
   bool sliceViewTensor = canSliceViewTensor(src, mpsShape);
   // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if ((!src.is_contiguous() || (src.is_view() && src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
+  if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) {
      Tensor emptyShell = Tensor();
     // use "_tensor" from Placeholder to retain view's output during its usage in other ops
     _tensor = gatherViewTensor(src, emptyShell);
diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index f858714fb82d..a79aeca766d3 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -163,6 +163,9 @@ void reduction_out_mps(
     if (reduction_type == MPSReductionType::PROD) {
       output_t.fill_(1);
     }
+    else if (reduction_type == MPSReductionType::SUM) {
+      output_t.zero_();
+    }
     return;
   }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index a1fc4d7dd5f7..ed83acc1db08 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10042,7 +10042,6 @@ class TestConsistency(TestCaseMPS):
         'nn.functional.prelu': ['f32'],
         'atanh': ['f32'],
         'div': ['f16'],
-        'nn.functional.bilinear': ['f32'],
         'nn.functional.embedding': ['f16'],
 
         # Unsupported dtype

From 1b09ea22a544719e562e4a5f77e081529853b139 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Tue, 14 Feb 2023 23:02:51 -0800
Subject: [PATCH 12/29] Update macOS 12 blocklist (#323)

* Update macOS 12 blocklist
- move sum, masked.var, mul to low precision list
- unblock them from running

* - mark __rdiv__ failures as accumulate error exceeds atol/rtol
---
 test/test_mps.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index ed83acc1db08..51d1063b9b6d 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10455,16 +10455,15 @@ class TestConsistency(TestCaseMPS):
         'addr',
 
         # for macOS 12
-        'masked.normalize', 'masked.sum',
+        'masked.normalize', 'masked.sum', 'masked.var',
         'outer',
-        'sum_to_size',
+        'sum_to_size', 'sum',
+        'mul',
     }
 
     BLOCKLIST_MACOS_12 = {
+        # failures because of accumulate error exceeds atol/rtol
         '__rdiv__': [torch.float16],
-        'masked.var': [torch.float16],
-        'sum': [torch.float16],
-        'mul': [torch.float16],
 
         # expected failures
         'nn.functional.interpolatenearest': [torch.float32],

From 8c7df6f68f1fda239c278e256578e43cc3f27fc7 Mon Sep 17 00:00:00 2001
From: jhavukainen <104022140+jhavukainen@users.noreply.github.com>
Date: Tue, 14 Feb 2023 23:42:45 -0800
Subject: [PATCH 13/29] [MPS] Fixes for LSTM. (#319)

- Backward pass has to give explicit bias tensor of zeros if none is passed to the op or the bias gradient will not be calculated.
- Fixed bias tensor mistakenly getting overwritten to zeros
- Fixes crash when lstm op called with has_biases set to false. Change takes into account the changed shape of the input params TensorList depending on the bias flag.

Co-authored-by: Kulin Seth <kulin_seth@apple.com>
---
 aten/src/ATen/native/mps/operations/RnnOps.mm | 91 ++++++++++++-------
 test/test_mps.py                              | 57 ++++++++++++
 2 files changed, 116 insertions(+), 32 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/RnnOps.mm b/aten/src/ATen/native/mps/operations/RnnOps.mm
index d46ce356318e..bee82fcc2480 100644
--- a/aten/src/ATen/native/mps/operations/RnnOps.mm
+++ b/aten/src/ATen/native/mps/operations/RnnOps.mm
@@ -30,10 +30,15 @@
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if (has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -71,8 +76,10 @@
             for (size_t i = 0; i < num_layers; i += 1) {
                 [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                 [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                if(has_biases) {
+                    [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                    [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                }
             }
 
             MPSGraphLSTMDescriptor * opDesc = [MPSGraphLSTMDescriptor descriptor];
@@ -109,9 +116,12 @@
             NSMutableArray<MPSGraphTensor*>* outputZStateArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             NSMutableArray<MPSGraphTensor*>* outputCellStateFwdArray = [[NSMutableArray alloc] initWithCapacity:num_layers];
             for(int i = 0; i < num_layers; i++) {
-                MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                    secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                MPSGraphTensor* biasTensor = nil;
+                if(has_biases) {
+                    biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                     secondaryTensor:recurrentBiasList[i]
+                                                                name:nil];
+                }
                 outputs = [mpsGraph LSTMWithSourceTensor:inputTensor_
                                         recurrentWeight:recurrentKernelWeightsList[i]
                                             inputWeight:kernelWeightsList[i]
@@ -121,7 +131,6 @@
                                              descriptor:opDesc
                                                    name:nil];
 
-
                 stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                             dimension:0
                                                             start:i
@@ -196,12 +205,14 @@
       for (size_t i = 0; i < num_layers; i+=1) {
           kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
           recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-          bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-          recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
           [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
           [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-          [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-          [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          if(has_biases) {
+            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+          }
 
       }
       Placeholder selfPlaceholder   = Placeholder(cachedGraph->inputTensors_[0], input);
@@ -250,10 +261,15 @@
     std::vector<Tensor> biases;
     std::vector<Tensor> recurrent_biases;
     for (size_t i = 0; i < num_layers; i+=1) {
-        kernel_weights.push_back(params[i*4]);
-        recurrent_kernel_weights.push_back(params[i*4+1]);
-        biases.push_back(params[i*4+2]);
-        recurrent_biases.push_back(params[i*4+3]);
+        if(has_biases) {
+            kernel_weights.push_back(params[i*4]);
+            recurrent_kernel_weights.push_back(params[i*4+1]);
+            biases.push_back(params[i*4+2]);
+            recurrent_biases.push_back(params[i*4+3]);
+        } else {
+            kernel_weights.push_back(params[i*2]);
+            recurrent_kernel_weights.push_back(params[i*2+1]);
+        }
     }
 
     struct CachedGraph : public MPSCachedGraph {
@@ -296,8 +312,10 @@
                     for (size_t i = 0; i < num_layers; i += 1) {
                         [kernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(kernel_weights[i]))];
                         [recurrentKernelWeightsList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_kernel_weights[i]))];
-                        [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
-                        [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        if(has_biases) {
+                            [kernelBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(biases[i]))];
+                            [recurrentBiasList addObject:mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()),getMPSShape(recurrent_biases[i]))];
+                        }
                     }
 
                     MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type()), getMPSShape(input));
@@ -349,9 +367,15 @@
                         cellStateFwd = [mpsGraph squeezeTensor:cellStateFwd
                                                     axis:0
                                                     name:nil];
-                        MPSGraphTensor* biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
-                                                                            secondaryTensor:recurrentBiasList[i]
-                                                                            name:nil];
+                        MPSGraphTensor* biasTensor = nil;
+                        if(has_biases) {
+                            biasTensor = [mpsGraph additionWithPrimaryTensor:kernelBiasList[i]
+                                                            secondaryTensor:recurrentBiasList[i]
+                                                            name:nil];
+                        } else {
+                            biasTensor = [mpsGraph constantWithScalar:0.0
+                                                            dataType:inputTensor.dataType];
+                        }
 
                         MPSGraphTensor* stateTensor_ = [mpsGraph sliceTensor:stateTensor
                                                                     dimension:0
@@ -391,7 +415,6 @@
                                                   descriptor: opDesc
                                                         name: nil];
 
-
                         gradientTensor_ = [outputs objectAtIndex:0];
                         [gradOutputArray addObject:[outputs objectAtIndex:0]];
                         [gradRecWeightsArray addObject:[outputs objectAtIndex:1]];
@@ -445,18 +468,20 @@
         for (size_t i = 0; i < num_layers; i+=1) {
             kernelWeight = Placeholder([kernelWeightsList objectAtIndex:i], kernel_weights[i]);
             recurrentKernelWeight = Placeholder([recurrentKernelWeightsList objectAtIndex:i], recurrent_kernel_weights[i]);
-            bias = Placeholder([biasList objectAtIndex:i], biases[i]);
-            recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
             [feeds setObject:kernelWeight.getMPSGraphTensorData() forKey:kernelWeight.getMPSGraphTensor()];
             [feeds setObject:recurrentKernelWeight.getMPSGraphTensorData() forKey:recurrentKernelWeight.getMPSGraphTensor()];
-            [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
-            [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            if(has_biases) {
+                bias = Placeholder([biasList objectAtIndex:i], biases[i]);
+                recurrentBias = Placeholder([recurrentBiasList objectAtIndex:i], recurrent_biases[i]);
+                [feeds setObject:bias.getMPSGraphTensorData() forKey:bias.getMPSGraphTensor()];
+                [feeds setObject:recurrentBias.getMPSGraphTensorData() forKey:recurrentBias.getMPSGraphTensor()];
+            }
         }
 
         Tensor output = at::empty_like(input);
         Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[0]);
         Tensor grad_weights = at::empty_like(kernel_weights[0]);
-        Tensor grad_bias = at::empty_like(biases[0]);
+        Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options());
         Tensor grad_state = at::empty_like(hx[0]);
         Tensor grad_cell_state = at::empty_like(hx[1]);
         Placeholder outputPlaceholder   = Placeholder(cachedGraph->outputTensors_[0], output);
@@ -482,13 +507,15 @@
             Tensor output = at::empty_like(input);
             Tensor grad_rec_weights = at::empty_like(recurrent_kernel_weights[i]);
             Tensor grad_weights = at::empty_like(kernel_weights[i]);
-            Tensor grad_bias = at::empty_like(biases[i]);
+            Tensor grad_bias = at::empty((kernel_weights[0].size(0)), kernel_weights[0].options());
             Tensor grad_state = at::empty_like(hx[0]);
             Tensor grad_cell_state = at::empty_like(hx[1]);
             weights.push_back(grad_weights);
             weights.push_back(grad_rec_weights);
-            weights.push_back(grad_bias);
-            weights.push_back(grad_bias);
+            if(has_biases) {
+                weights.push_back(grad_bias);
+                weights.push_back(grad_bias);
+            }
             gradOutPlaceholder = Placeholder([gradOutputArray objectAtIndex:i], output);
             gradRecWeightsPlaceholder = Placeholder([gradRecWeightsArray objectAtIndex:i], grad_rec_weights);
             gradWeightsPlaceholder = Placeholder([gradWeightsArray objectAtIndex:i], grad_weights);
diff --git a/test/test_mps.py b/test/test_mps.py
index 51d1063b9b6d..d63b98083f3b 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8588,6 +8588,63 @@ def get_results(device):
         self.assertEqual(cpu_input_grad, mps_input_grad)
         self.assertEqual(cpu_weight_grad, mps_weight_grad)
 
+    def test_RNN_cell_no_broadcasting(self):
+        def test(cell_module, input, hx, input_size, hidden_size):
+            cell = cell_module(input_size, hidden_size, device='mps')
+            self.assertRaises(RuntimeError, lambda: cell(input, hx))
+
+        def test_all(hidden_size, bad_hx, good_hx, input_size, input):
+            test(nn.RNNCell, input, bad_hx, input_size, hidden_size)
+            test(nn.GRUCell, input, bad_hx, input_size, hidden_size)
+            test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size)
+            test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size)
+
+        hidden_size = 20
+        input_size = 10
+        input = torch.randn(3, input_size, device='mps')
+        bad_hx = torch.randn(1, hidden_size, device='mps')
+        good_hx = torch.randn(3, hidden_size, device='mps')
+
+        # Test hidden/input batch size broadcasting
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test hx's hidden_size vs module's hidden_size broadcasting
+        bad_hx = torch.randn(3, 1)
+        test_all(hidden_size, bad_hx, good_hx, input_size, input)
+
+        # Test input's input_size vs module's input_size broadcasting
+        bad_input = torch.randn(3, 1)
+        test_all(hidden_size, good_hx, good_hx, input_size, bad_input)
+
+    def test_LSTM_cell(self):
+        # this is just a smoke test; these modules are implemented through
+        # autograd so no Jacobian test is needed
+        for bias in (True, False):
+            input = torch.randn(3, 10, device='mps')
+            hx = torch.randn(3, 20, device='mps')
+            cx = torch.randn(3, 20, device='mps')
+            lstm = nn.LSTMCell(10, 20, bias=bias, device='mps')
+            for _ in range(6):
+                hx, cx = lstm(input, (hx, cx))
+
+            (hx + cx).sum().backward()
+
+    def test_LSTM_cell_forward_input_size(self):
+        input = torch.randn(3, 11, device='mps')
+        hx = torch.randn(3, 20, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+
+    def test_LSTM_cell_forward_hidden_size(self):
+        input = torch.randn(3, 10, device='mps')
+        hx = torch.randn(3, 21, device='mps')
+        cx = torch.randn(3, 20, device='mps')
+        lstm = nn.LSTMCell(10, 20, device='mps')
+        self.assertRaises(Exception, lambda: lstm(input, (hx, cx)))
+        self.assertRaises(Exception, lambda: lstm(input, (cx, hx)))
+
+
 class TestFallbackWarning(TestCase):
     # TODO: Remove once test_testing.py is running on MPS devices
     def test_no_warning_on_import(self):

From d42f74f70a17d40c3df994468a78800e0e56f9db Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Tue, 14 Feb 2023 23:43:02 -0800
Subject: [PATCH 14/29] Fix nn.functional.conv_transpose2d grad (#312) (#329)

- add _mps_convolution_impl that takes optional shape
- for conv_tranpose2d grad, use the shape from input directly
- remove nn.functional.conv_transpose2d grad from blocklist

Co-authored-by: Ronian526 <11454459+Ronian526@users.noreply.github.com>
---
 .../ATen/native/mps/operations/Convolution.mm | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 3cd442099f5c..7c0a33d36d04 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -56,14 +56,15 @@ void fill_conv_desc(MPSGraphConvolution2DOpDescriptor* descriptor_,
   descriptor_.groups = groups;
 }
 
-Tensor _mps_convolution(
+Tensor _mps_convolution_impl(
     const Tensor& input_t,
     const Tensor& weight_t,
     const c10::optional<Tensor>& bias_opt,
     IntArrayRef padding,
     IntArrayRef stride,
     IntArrayRef dilation,
-    int64_t groups) {
+    int64_t groups,
+    c10::optional<IntArrayRef> input_shape) {
   TORCH_CHECK(input_t.dim() < 5, "Conv3D is not supported on MPS");
 
   namespace native_mps = at::native::mps;
@@ -83,6 +84,8 @@ Tensor _mps_convolution(
   auto memory_format = input_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
   auto output_t = at::empty(
+                    input_shape.has_value() ?
+                    input_shape.value() :
                     conv_output_size(input->sizes(), weight->sizes(),
                                      padding, stride, dilation),
                     input->scalar_type(),
@@ -237,6 +240,17 @@ Tensor _mps_convolution(
   return *output;
 }
 
+Tensor _mps_convolution(
+    const Tensor& input_t,
+    const Tensor& weight_t,
+    const c10::optional<Tensor>& bias_opt,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups) {
+    return _mps_convolution_impl(input_t, weight_t, bias_opt, padding, stride, dilation, groups, c10::nullopt);
+}
+
 Tensor mps_convolution_backward_input(
     IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
@@ -576,10 +590,10 @@ Tensor _mps_convolution_transpose(
 Tensor mps_convolution_transpose_backward_input(
     const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
-    int64_t groups)
+    int64_t groups, IntArrayRef input_shape)
 {
-  return at::_mps_convolution(
-    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups);
+  return _mps_convolution_impl(
+    grad_output_t, weight_t, c10::nullopt, padding, stride, dilation, groups, input_shape);
 }
 
 Tensor mps_convolution_transpose_backward_weight(
@@ -603,7 +617,7 @@ Tensor mps_convolution_transpose_backward_weight(
 
   Tensor grad_input, grad_weight;
   if (output_mask[0]) {
-    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups);
+    grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
   }
   if (output_mask[1]) {
     grad_weight = mps_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups);

From 285620362f38582b09bed753f32de3e357c5e467 Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Wed, 15 Feb 2023 15:07:51 -0500
Subject: [PATCH 15/29] Fix the crash in elu_backward() (#333)

Fixes a crash where the inputTensor could go null and cause a crash.
---
 .../ATen/native/mps/operations/Activation.mm  | 45 +++++--------------
 1 file changed, 12 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 9e643ebf2939..84c2f8789790 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1208,8 +1208,7 @@ void elu_variants_out_mps (
   {
     CachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {}
     MPSGraphTensor *gradOutputTensor_ = nil;
-    MPSGraphTensor *inputTensor_ = nil;
-    MPSGraphTensor *resultTensor_ = nil;
+    MPSGraphTensor *selfOrResultTensor_ = nil;
     MPSGraphTensor *gradInputTensor_ = nil;
   };
 
@@ -1218,7 +1217,7 @@ void elu_variants_out_mps (
   MPSStream* stream = getCurrentMPSStream();
 
   @autoreleasepool {
-    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output}) + ":" +
+    string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
                                                  to_string(alpha.to<double>()) + ":" +
                                                  to_string(scale.to<double>()) + ":" +
                                                  to_string(input_scale.to<double>()) + ":" +
@@ -1235,18 +1234,14 @@ void elu_variants_out_mps (
           newCachedGraph = new CachedGraph(mpsGraph);
 
           MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-
-          MPSGraphTensor* inputTensor = nil;
-          MPSGraphTensor* resultTensor = nil;
-
+          MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
           MPSGraphTensor* lessThanZeroGradTensor = nil;
 
           if(is_result) {
-            resultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
                                                                shape:@[@1]
                                                             dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:resultTensor
+            MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
                                                                         secondaryTensor:alphaTensor
                                                                                    name:nil];
             auto constMul = scale.to<double>() * input_scale.to<double>();
@@ -1258,11 +1253,10 @@ void elu_variants_out_mps (
                                                                           name:nil];
           }
           else {
-            inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
             MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
                                                                     shape:@[@1]
                                                                  dataType:getMPSDataType(grad_output.scalar_type())];
-            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
+            MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
                                                                           secondaryTensor:inputScaleTensor
                                                                                      name:nil];
             MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor
@@ -1282,7 +1276,7 @@ void elu_variants_out_mps (
           MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
                                                               shape:@[@1]
                                                            dataType:getMPSDataType(grad_output.scalar_type())];
-          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
+          MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
                                                                    secondaryTensor:zeroTensor
                                                                               name:nil];
           MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
@@ -1294,8 +1288,7 @@ void elu_variants_out_mps (
                                                                                  name:nil];
 
           newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-          newCachedGraph->inputTensor_ = inputTensor;
-          newCachedGraph->resultTensor_ = resultTensor;
+          newCachedGraph->selfOrResultTensor_ = selfOrResultTensor;
           newCachedGraph->gradInputTensor_ = gradInputTensor;
         }
         return newCachedGraph;
@@ -1304,28 +1297,14 @@ void elu_variants_out_mps (
     }
 
     Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
-    Placeholder selfPlaceholder = Placeholder();
-    Placeholder resultPlaceholder = Placeholder();
-    if(is_result)
-      resultPlaceholder = Placeholder(cachedGraph->resultTensor_, self_or_result, nil, executeGatherOp);
-    else
-      selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
+    Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->selfOrResultTensor_, self_or_result, nil, executeGatherOp);
     Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
 
     // Create dictionary of inputs and outputs
-    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = nil;
-
-    if(is_result)
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()
-      };
-    else
-      feeds = @{
-        gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
-        selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()
-      };
-
+    NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
+      gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(),
+      selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData()
+    };
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
       gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()
     };

From 18797b00dedb80df3a8f41474260683bd2a63a23 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 15 Feb 2023 13:14:39 -0800
Subject: [PATCH 16/29] Fix nn.functional.embedding grad (#335)

- casting the input tensor to float32 and cast back the output tensor
- unblock the test
---
 aten/src/ATen/native/mps/operations/Indexing.mm | 16 ++++++++++++++--
 test/test_mps.py                                |  1 -
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 310cbb7bf937..036f0a242f11 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -886,19 +886,31 @@ Tensor embedding_dense_backward_mps(
 
             MPSGraphTensor* reshapedIndicesTensor = indicesTensor;
 
+            MPSGraphTensor* castGradTensor = incomingGradTensor;
+            MPSDataType dataType = mps::getMPSDataType(grad_.scalar_type());
+            // issue 105486100, scatterNDWithUpdatesTensor produces wrong result for float16
+            if (dataType == MPSDataTypeFloat16) {
+              castGradTensor = [mpsGraph castTensor: incomingGradTensor
+                                             toType: MPSDataTypeFloat32
+                                               name: nil];
+            }
             if (num_indices_dims != 0) {
               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor
                                                                axes: @[@-1]
                                                                name: nil];
             }
 
-            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: incomingGradTensor
+            auto outgoingGradTensor = [mpsGraph scatterNDWithUpdatesTensor: castGradTensor
                                                              indicesTensor: reshapedIndicesTensor
                                                                      shape: native_mps::getMPSShape(IntArrayRef(outgoing_gradient_shape))
                                                            batchDimensions: 0
                                                                       mode: MPSGraphScatterModeAdd
                                                                       name: @"edb"];
-
+            if (dataType == MPSDataTypeFloat16) {
+              outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
+                                                 toType: MPSDataTypeFloat16
+                                                   name: nil];
+            }
             newCachedGraph->incomingGradTensor_ = incomingGradTensor;
             newCachedGraph->indicesTensor_ = indicesTensor;
             newCachedGraph->outgoingGradTensor_ = outgoingGradTensor;
diff --git a/test/test_mps.py b/test/test_mps.py
index d63b98083f3b..aea4cfe199b0 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10099,7 +10099,6 @@ class TestConsistency(TestCaseMPS):
         'nn.functional.prelu': ['f32'],
         'atanh': ['f32'],
         'div': ['f16'],
-        'nn.functional.embedding': ['f16'],
 
         # Unsupported dtype
         'special.ndtr': ['f32'],

From cf06ac5c9c4a8038562a2f7a454126afc19f328a Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 13:17:39 -0800
Subject: [PATCH 17/29] Fix prelu backward (#334)

---
 aten/src/ATen/native/mps/operations/Activation.mm | 2 +-
 test/test_mps.py                                  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm
index 84c2f8789790..440cde4140f4 100644
--- a/aten/src/ATen/native/mps/operations/Activation.mm
+++ b/aten/src/ATen/native/mps/operations/Activation.mm
@@ -1819,7 +1819,7 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) {
     using namespace mps;
 
     Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
-    Tensor weight_grad = at::empty_like(weight_, at::MemoryFormat::Contiguous);
+    Tensor weight_grad = at::empty_like(self, at::MemoryFormat::Contiguous);
     if (grad_output.numel() == 0) {
       return std::tuple<Tensor, Tensor>{grad_input, weight_grad};
     }
diff --git a/test/test_mps.py b/test/test_mps.py
index aea4cfe199b0..d6bb86b9db3e 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10096,7 +10096,6 @@ class TestConsistency(TestCaseMPS):
         'trace': ['f32'],
 
         # Correctness issues
-        'nn.functional.prelu': ['f32'],
         'atanh': ['f32'],
         'div': ['f16'],
 

From c65b8236c47a16f97a13e2aa3590dbeef0252fdd Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Wed, 15 Feb 2023 15:56:12 -0800
Subject: [PATCH 18/29] Reduction cast f16 to f32 only on macOS 12 (#332)

- unblock rdiv float16
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 5 ++++-
 test/test_mps.py                                 | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index a79aeca766d3..d4112c99f6a8 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -200,7 +200,10 @@ void reduction_out_mps(
              (dtype.value() == kFloat || dtype.value() == kHalf || dtype.value() == kInt)) {
             inputCastDtype = getMPSDataType(dtype.value());
           } else if (input_type != MPSDataTypeInt32   &&
-                     input_type != MPSDataTypeFloat32) {
+                     input_type != MPSDataTypeFloat32 &&
+                     input_type != MPSDataTypeFloat16) {
+            inputCastDtype = MPSDataTypeFloat32;
+          } else if (!is_macos_13_or_newer() && input_type == MPSDataTypeFloat16) {
             inputCastDtype = MPSDataTypeFloat32;
           }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index d6bb86b9db3e..261b19fca9f5 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10517,9 +10517,6 @@ class TestConsistency(TestCaseMPS):
     }
 
     BLOCKLIST_MACOS_12 = {
-        # failures because of accumulate error exceeds atol/rtol
-        '__rdiv__': [torch.float16],
-
         # expected failures
         'nn.functional.interpolatenearest': [torch.float32],
         'nn.functional.upsample_nearest': [torch.float32],

From 73f706846a1c57e2b26230a0f045f53eef548eb8 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 20:24:37 -0800
Subject: [PATCH 19/29] Remove periodic file (running between PRs) (#336)

---
 .github/workflows/periodic.yml | 284 ---------------------------------
 1 file changed, 284 deletions(-)
 delete mode 100644 .github/workflows/periodic.yml

diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
deleted file mode 100644
index 1c137084a97e..000000000000
--- a/.github/workflows/periodic.yml
+++ /dev/null
@@ -1,284 +0,0 @@
-name: periodic
-
-on:
-  schedule:
-    - cron: 45 0,4,8,12,16,20 * * *
-    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
-  push:
-    tags:
-      - ciflow/periodic/*
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-  cancel-in-progress: true
-
-jobs:
-  parallelnative-linux-focal-py3_8-gcc7-build:
-    name: parallelnative-linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: parallelnative-linux-focal-py3.8-gcc7
-      docker-image-name: pytorch-linux-focal-py3.8-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-        ]}
-
-  parallelnative-linux-focal-py3_8-gcc7-test:
-    name: parallelnative-linux-focal-py3.8-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: parallelnative-linux-focal-py3_8-gcc7-build
-    with:
-      build-environment: parallelnative-linux-focal-py3.8-gcc7
-      docker-image: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.parallelnative-linux-focal-py3_8-gcc7-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build:
-    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-test:
-    name: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3-gcc7-slow-gradcheck
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3-gcc7-slow-gradcheck-build.outputs.test-matrix }}
-      timeout-minutes: 300
-
-  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build:
-    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      cuda-arch-list: '8.6'
-      test-matrix: |
-        { include: [
-          { config: "aot_eager_all", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          # These jobs run too slowly so they must be sharded, unfortunately
-          { config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-test:
-    name: cuda11.7-py3.10-gcc7-sm86-periodic-dynamo-benchmarks
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-sm86
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
-
-  linux-focal-rocm5_4_2-py3_8-build:
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image-name: pytorch-linux-focal-rocm-n-py3
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
-        ]}
-
-  linux-focal-rocm5_4_2-py3_8-test:
-    name: linux-focal-rocm5.4.2-py3.8
-    uses: ./.github/workflows/_rocm-test.yml
-    needs: linux-focal-rocm5_4_2-py3_8-build
-    with:
-      build-environment: linux-focal-rocm5.4.2-py3.8
-      docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
-    secrets:
-      AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
-      AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
-  linux-bionic-cuda11_7-py3_9-gcc7-build:
-    name: linux-bionic-cuda11.7-py3.9-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      test-matrix: |
-        { include: [
-          { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.16xlarge.nvidia.gpu" },
-        ]}
-      build-with-debug: false
-
-  linux-bionic-cuda11_7-py3_9-gcc7-test:
-    name: linux-bionic-cuda11.7-py3.9-gcc7
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_9-gcc7-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.9-gcc7
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_9-gcc7-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-debug-build:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      build-with-debug: true
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_7-py3_10-gcc7-debug-test:
-    name: linux-bionic-cuda11.7-py3.10-gcc7-debug
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_7-py3_10-gcc7-debug-build
-    with:
-      build-environment: linux-bionic-cuda11.7-py3.10-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-debug-build.outputs.test-matrix }}
-
-  linux-bionic-cuda11_8-py3_8-gcc7-debug-build:
-    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
-      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
-      build-with-debug: true
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.4xlarge.nvidia.gpu" },
-        ]}
-
-  linux-bionic-cuda11_8-py3_8-gcc7-debug-test:
-    name: linux-bionic-cuda11.8-py3.8-gcc7-debug
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-bionic-cuda11_8-py3_8-gcc7-debug-build
-    with:
-      build-environment: linux-bionic-cuda11.8-py3.8-gcc7-debug
-      docker-image: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-bionic-cuda11_8-py3_8-gcc7-debug-build.outputs.test-matrix }}
-
-  libtorch-linux-bionic-cuda11_8-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.8-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: libtorch-linux-bionic-cuda11.8-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.8-cudnn8-py3-gcc7
-      build-generates-artifacts: false
-
-  win-vs2019-cuda11_8-py3-build:
-    name: win-vs2019-cuda11.8-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.8-py3
-      cuda-version: "11.8"
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cuda11_8-py3-test:
-    name: win-vs2019-cuda11.8-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cuda11_8-py3-build
-    with:
-      build-environment: win-vs2019-cuda11.8-py3
-      cuda-version: "11.8"
-      test-matrix: ${{ needs.win-vs2019-cuda11_8-py3-build.outputs.test-matrix }}
-
-  libtorch-linux-bionic-cuda11_7-gcc7-build:
-    name: libtorch-linux-bionic-cuda11.7-gcc7
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: libtorch-linux-bionic-cuda11.7-gcc7
-      docker-image-name: pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7
-      build-generates-artifacts: false
-
-  win-vs2019-cuda11_7-py3-build:
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-build.yml
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 3, runner: "windows.g5.4xlarge.nvidia.gpu" },
-          { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge" },
-        ]}
-
-  win-vs2019-cuda11_7-py3-test:
-    name: win-vs2019-cuda11.7-py3
-    uses: ./.github/workflows/_win-test.yml
-    needs: win-vs2019-cuda11_7-py3-build
-    with:
-      build-environment: win-vs2019-cuda11.7-py3
-      cuda-version: "11.7"
-      test-matrix: ${{ needs.win-vs2019-cuda11_7-py3-build.outputs.test-matrix }}
-
-  ios-12-5-1-x86-64-coreml:
-    name: ios-12-5-1-x86-64-coreml
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-x86-64-coreml
-      ios-platform: SIMULATOR
-      ios-arch: x86_64
-
-  ios-12-5-1-arm64:
-    name: ios-12-5-1-arm64
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-coreml:
-    name: ios-12-5-1-arm64-coreml
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-coreml
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-custom-ops:
-    name: ios-12-5-1-arm64-custom-ops
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-custom-ops
-      ios-platform: OS
-      ios-arch: arm64
-
-  ios-12-5-1-arm64-metal:
-    name: ios-12-5-1-arm64-metal
-    uses: ./.github/workflows/_ios-build-test.yml
-    with:
-      build-environment: ios-12-5-1-arm64-metal
-      ios-platform: OS
-      ios-arch: arm64
-
-  buck-build-test:
-    name: buck-build-test
-    uses: ./.github/workflows/_buck-build-test.yml

From 1c8f126f1ee183385085dbc17b4bf7ac78db35f8 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 20:36:31 -0800
Subject: [PATCH 20/29] Fix upsample for NHWC output (#337)

* Fix upsample for NHWC output

* Add testcase
---
 aten/src/ATen/native/mps/operations/UpSample.mm | 11 ++++++++++-
 test/test_mps.py                                |  9 +++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm
index 17895e19c7d7..3b781dea08f4 100644
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@@ -26,6 +26,11 @@ void upsample_out_template(const Tensor& input,
   } else {
     native::upsample_2d_common_check(input.sizes(), output_size);
   }
+  Tensor out;
+  if (!output.is_contiguous()) {
+    out = at::empty_like(output, MemoryFormat::Contiguous);
+  }
+
   bool centerResults = false;
   MPSGraphResizeMode resizeMode = MPSGraphResizeNearest;
   MPSGraphResizeNearestRoundingMode nearestRoundingMode = MPSGraphResizeNearestRoundingModeFloor;
@@ -199,7 +204,7 @@ void upsample_out_template(const Tensor& input,
     MPSGraphTensorData* sizeTensorData = [[[MPSGraphTensorData alloc] initWithMPSNDArray: sizeNDArray] autorelease];
 
     Placeholder inputPlaceholder  = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, out.has_storage() ? out : output, nil, false);
 
     NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds = @{
         inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(),
@@ -209,6 +214,10 @@ void upsample_out_template(const Tensor& input,
         outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()
     };
     runMPSGraph(stream, cachedGraph->graph(), feeds, results);
+
+    if (out.has_storage()) {
+      output.copy_(out);
+    }
   }
 }
 
diff --git a/test/test_mps.py b/test/test_mps.py
index 261b19fca9f5..47ddaa210ac3 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -4362,9 +4362,9 @@ def helper(shape):
         helper((50, 20, 7, 4))
 
     def test_upsample_nearest2d(self):
-        def helper(N, C, H, W):
+        def helper(N, C, H, W, memory_format):
             inputCPU = torch.arange(N * C * H * W, device='cpu', dtype=torch.float,
-                                    requires_grad=True).reshape(N, C, H, W)
+                                    requires_grad=True).reshape(N, C, H, W).to(memory_format=memory_format)
             inputCPU.retain_grad()
             inputMPS = inputCPU.detach().to('mps').requires_grad_()
 
@@ -4390,8 +4390,9 @@ def helper(N, C, H, W):
 
                     self.assertEqual(inputCPU.grad, inputMPS.grad)
 
-        helper(1, 1, 4, 4)
-        helper(7, 5, 3, 2)
+        for memory_format in [torch.channels_last, torch.contiguous_format]:
+            helper(1, 1, 4, 4, memory_format=memory_format)
+            helper(7, 5, 3, 2, memory_format=memory_format)
 
     def test_upsample_bilinear2d(self):
         def helper(N, C, H, W):

From 42be72a92ba216a22812fd6403e78859fdc37a01 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Wed, 15 Feb 2023 21:27:00 -0800
Subject: [PATCH 21/29] [DOWNSTREAM] Fix build failure on x86 runners (#338)

---
 aten/src/ATen/native/mps/operations/Indexing.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm
index 036f0a242f11..8522ac920275 100644
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@@ -892,7 +892,7 @@ Tensor embedding_dense_backward_mps(
             if (dataType == MPSDataTypeFloat16) {
               castGradTensor = [mpsGraph castTensor: incomingGradTensor
                                              toType: MPSDataTypeFloat32
-                                               name: nil];
+                                               name: @"castGradTensor"];
             }
             if (num_indices_dims != 0) {
               reshapedIndicesTensor = [mpsGraph  expandDimsOfTensor: indicesTensor
@@ -909,7 +909,7 @@ Tensor embedding_dense_backward_mps(
             if (dataType == MPSDataTypeFloat16) {
               outgoingGradTensor = [mpsGraph castTensor: outgoingGradTensor
                                                  toType: MPSDataTypeFloat16
-                                                   name: nil];
+                                                   name: @"castGradTensor"];
             }
             newCachedGraph->incomingGradTensor_ = incomingGradTensor;
             newCachedGraph->indicesTensor_ = indicesTensor;

From 6ace5f94a66ca1ea31f10ae50eb87c7dcdc83496 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 16 Feb 2023 21:37:34 -0800
Subject: [PATCH 22/29] Fix trace op (#340)

- give warnings of converting int64 for reduction ops
- use cast tensor for reduction sum on trace
- unblock trace from running
---
 aten/src/ATen/native/mps/operations/ReduceOps.mm | 4 +++-
 test/test_mps.py                                 | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm
index d4112c99f6a8..f47dd910dc23 100644
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@@ -139,6 +139,8 @@ void reduction_out_mps(
   MPSReductionType reduction_type,
   const std::string& func_name) {
 
+  // issue 103641234, reduction ops does not have int64 support
+  TORCH_WARN_ONCE(input_t.scalar_type() != ScalarType::Long, "MPS: no support for int64 reduction ops, casting it to int32");
   IntArrayRef input_shape = input_t.sizes();
 
   if (opt_dim.has_value()) {
@@ -247,7 +249,7 @@ void reduction_out_mps(
                                                                axes:wrappedAxes
                                                                name:nil];
           } else if (reduction_type == MPSReductionType::TRACE) {
-            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:inputTensor
+            MPSGraphTensor *bandPartWithTensor = [mpsGraph bandPartWithTensor:castInputTensor
                                                                      numLower:0
                                                                      numUpper:0
                                                                          name:nil];
diff --git a/test/test_mps.py b/test/test_mps.py
index 47ddaa210ac3..556b44362e58 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10123,7 +10123,6 @@ class TestConsistency(TestCaseMPS):
 
         # Functions with correctness issues
         'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
-        'trace': [torch.int64],
         'normalnumber_mean': [torch.float16, torch.float32],
         'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'multinomial': [torch.float32],

From c9b8ab7091310c29335b587c562009a1ac147855 Mon Sep 17 00:00:00 2001
From: Ronian526 <11454459+Ronian526@users.noreply.github.com>
Date: Thu, 16 Feb 2023 21:42:16 -0800
Subject: [PATCH 23/29] Update random result list (#339)

* - move nn.functional.feature_alpha_dropoutwith_train, normalnumber_mean, new_empty_strided to expected failures

* - update new_empty_strided

---------

Co-authored-by: Kulin Seth <kulin_seth@apple.com>
---
 test/test_mps.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index 556b44362e58..ea762155f8d3 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -10122,9 +10122,6 @@ class TestConsistency(TestCaseMPS):
         'topk': [torch.int16, torch.int32, torch.int64, torch.uint8],
 
         # Functions with correctness issues
-        'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
-        'normalnumber_mean': [torch.float16, torch.float32],
-        'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'multinomial': [torch.float32],
 
         # cpu result off, showing random values
@@ -10472,7 +10469,9 @@ class TestConsistency(TestCaseMPS):
         'randint_like': [torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'randn_like': [torch.float16, torch.float32],
         'bernoulli': [torch.float32],
+        'nn.functional.feature_alpha_dropoutwith_train': [torch.float32],
         'normal': [torch.float16, torch.float32, torch.float16, torch.float32],
+        'normalnumber_mean': [torch.float16, torch.float32],
         'nn.functional.alpha_dropout': [torch.float32],
         'nn.functional.dropout': [torch.float32],
         'nn.functional.dropout2d': [torch.float32],
@@ -10481,6 +10480,7 @@ class TestConsistency(TestCaseMPS):
         'new_empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'empty_like': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         'empty': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
+        'new_empty_strided': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         # problem 103190467, as_strided_scatter has non-deterministic behavior when the update indices are not unique
         'as_strided_scatter': [torch.bool, torch.float16, torch.float32, torch.int16, torch.int32, torch.int64, torch.uint8],
         # duplicate indices are used in the testcase - undefined behaviour

From d3e414e2b28cb9d0799db7bb20d6917016420d77 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Fri, 17 Feb 2023 09:21:54 -0800
Subject: [PATCH 24/29] Fix convolution crash in backward with weights; remove
 unnecessary contiguous calls (#341)

* Fix convolution crash; remove unnecessary contiguous calls

* Fix lintrunner
---
 .../ATen/native/mps/operations/Convolution.mm | 32 ++-----
 test/test_mps.py                              | 95 +++++++++++++++++--
 2 files changed, 95 insertions(+), 32 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm
index 7c0a33d36d04..4bddbba917f5 100644
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@@ -252,20 +252,17 @@ Tensor _mps_convolution(
 }
 
 Tensor mps_convolution_backward_input(
-    IntArrayRef input_size, const Tensor& grad_output_, const Tensor& weight_,
+    IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_input";
-  TensorArg grad_output{ grad_output_, "grad_output", 1 },
-            weight{ weight_, "weight", 2 };
+  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
+            weight{ weight_t, "weight", 2 };
   checkAllSameType(c, {grad_output, weight});
   checkAllSameGPU(c, {grad_output, weight});
-  auto memory_format = grad_output_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
-  Tensor grad_output_t = grad_output_.contiguous(memory_format);
-  Tensor weight_t = weight_.contiguous(memory_format);
-  MPSShape* weightShape = getMPSShape(weight_);
   auto grad_input_t = at::empty( input_size, grad_output_t.options(), c10::nullopt);
 
   // Avoid "grad_input" when this is being used as transposed convolution
@@ -341,7 +338,7 @@ Tensor mps_convolution_backward_input(
           }
 
           MPSGraphTensor* gradOutputTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(grad_output_t.scalar_type()), gradOutputShape);
-          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, native_mps::getMPSScalarType(weight_t.scalar_type()), weightShape);
+          MPSGraphTensor* weightTensor = native_mps::mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
 
           MPSGraphTensor *gradOutputTensorTranspose = gradOutputTensor;
           if (is_channels_last && grad_output_t.is_contiguous() && !grad_output_t.is_view()) {
@@ -373,7 +370,7 @@ Tensor mps_convolution_backward_input(
     }
 
     auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
-    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t, weightShape);
+    auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
     auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
 
     NSDictionary<MPSGraphTensor *, MPSGraphTensorData *> *feeds = @{
@@ -391,17 +388,14 @@ Tensor mps_convolution_backward_input(
 }
 
 Tensor mps_convolution_backward_weights(
-    IntArrayRef weight_size, const Tensor& grad_output_, const Tensor& input_,
+    IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool bias_defined) {
   namespace native_mps = at::native::mps;
   using namespace mps;
   CheckedFrom c = "mps_convolution_backward_weights";
-  auto memory_format = input_.suggest_memory_format();
+  auto memory_format = grad_output_t.suggest_memory_format();
   bool is_channels_last = (memory_format == at::MemoryFormat::ChannelsLast);
 
-  auto grad_output_t = grad_output_.to(memory_format);
-  auto input_t = input_.to(memory_format);
-
   MPSShape* gradOutputShape = mps::getMPSShape(grad_output_t, memory_format);
 
   // For uniformity with everything else, although it seems grad_weight
@@ -539,12 +533,9 @@ Tensor mps_convolution_backward_weights(
 }
 
 std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward(
-    const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
+    const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
     IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,3> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight, grad_bias;
   if (input.numel() == 0) {
     if (output_mask[0]) {
@@ -609,12 +600,9 @@ Tensor mps_convolution_transpose_backward_weight(
 
 
 std::tuple<Tensor,Tensor> mps_convolution_transpose_backward(
-    const Tensor& input, const Tensor& grad_output_t, const Tensor& weight,
+    const Tensor& input, const Tensor& grad_output, const Tensor& weight,
     IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
     std::array<bool,2> output_mask) {
-
-  Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format());
-
   Tensor grad_input, grad_weight;
   if (output_mask[0]) {
     grad_input = mps_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, input.sizes());
diff --git a/test/test_mps.py b/test/test_mps.py
index ea762155f8d3..3de06f2bd276 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -7424,7 +7424,8 @@ def test_conv_transpose_1d_nn_functional(self):
     def test_conv_backward_1d_channels_last(self):
         def helper(shape, in_channels=1, out_channels=1, kernel_size=3, groups=1):
             # https://github.com/pytorch/pytorch/issues/84511
-            conv_cpu = torch.nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups)
+            conv_cpu = torch.nn.Conv1d(
+                in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).requires_grad_()
             conv_mps = torch.nn.Conv1d(
                 in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, groups=groups).to("mps")
             conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_(True)
@@ -7464,15 +7465,89 @@ def test_conv1d_contiguous(self):
 
     def test_conv2d_all_strides_paddings(self):
         # https://github.com/pytorch/pytorch/issues/83180
-        y_cpu = torch.randn(2, 2, 3, 6)
-        y_gpu = y_cpu.to(device='mps')
-        for strideX in range(1, 4):
-            for strideY in range(1, 4):
-                conv_cpu = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, stride=(strideX, strideY))
-                conv_gpu = copy.deepcopy(conv_cpu).to(device='mps')
-                x_cpu = conv_cpu(y_cpu)
-                x_gpu = conv_gpu(y_gpu)
-                self.assertEqual(x_cpu, x_gpu.cpu(), rtol=1e-03, atol=1e-05)
+        def helper(N, C, H, W, groups, input_mem_format, weight_mem_format, permute_data):
+            x_cpu = torch.randn(N, C, H, W).to(memory_format=input_mem_format).requires_grad_()
+            x_mps = x_cpu.detach().clone().to(device='mps').requires_grad_()
+
+            if permute_data:
+                x_cpu.permute(0, 2, 3, 1)
+                x_mps.permute(0, 2, 3, 1)
+
+            for strideX in range(1, 4):
+                for strideY in range(1, 4):
+                    conv_cpu = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY)).requires_grad_()
+                    conv_cpu.weight.data = conv_cpu.weight.to(memory_format=weight_mem_format).requires_grad_()
+
+                    conv_mps = torch.nn.Conv2d(
+                        in_channels=N, out_channels=C, kernel_size=H, groups=groups, stride=(strideX, strideY), device="mps")
+                    conv_mps.weight.data = conv_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+                    conv_mps.bias.data = conv_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+                    res_cpu = conv_cpu(x_cpu)
+                    res_mps = conv_mps(x_mps)
+                    self.assertEqual(res_cpu, res_mps.cpu(), rtol=1e-03, atol=1e-05)
+
+                    res_cpu = res_cpu.sum().backward()
+                    res_mps = res_mps.sum().backward()
+                    self.assertEqual(res_cpu, res_mps, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.weight.grad, conv_mps.weight.grad, rtol=2.6e-05, atol=2e-04)
+                    self.assertEqual(conv_cpu.bias.grad, conv_mps.bias.grad)
+                    self.assertEqual(x_cpu.grad, x_mps.grad)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            for mem_format_weight in [torch.contiguous_format, torch.channels_last]:
+                for permute_data in [True, False]:
+                    helper(2, 2, 3, 6, 1, mem_format_input, mem_format_weight, permute_data)
+                    helper(10, 10, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+                    helper(32, 32, 4, 6, 2, mem_format_input, mem_format_weight, permute_data)
+
+    def test_conv_transpose_2d_strided(self):
+        def helper(m_cpu, memory_format):
+            m_mps = copy.deepcopy(m_cpu).requires_grad_()
+            m_mps.weight.data = m_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+            m_mps.bias.data = m_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+            input_cpu = torch.randn(20, 16, 50, 100).to(memory_format=memory_format).requires_grad_()
+            input_mps = input_cpu.detach().clone().to("mps")
+
+            output_cpu = m_cpu(input_cpu)
+            output_mps = m_mps(input_mps)
+            self.assertEqual(output_cpu, output_mps)
+
+        for mem_format_input in [torch.contiguous_format, torch.channels_last]:
+            # With square kernels and equal stride
+            helper(nn.ConvTranspose2d(16, 33, 3, stride=2).requires_grad_(), mem_format_input)
+
+            # non-square kernels and unequal stride and with padding
+            helper(nn.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2)).requires_grad_(), mem_format_input)
+
+    def test_conv_transpose_2d_specified_output(self):
+        input_cpu = torch.randn(1, 16, 12, 12)
+        input_mps = input_cpu.detach().clone().to("mps")
+
+        downsample_cpu = nn.Conv2d(16, 16, 3, stride=2, padding=1)
+        downsample_mps = nn.Conv2d(16, 16, 3, stride=2, padding=1, device="mps")
+        downsample_mps.weight.data = downsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        downsample_mps.bias.data = downsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        upsample_cpu = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        upsample_mps = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, device="mps")
+        upsample_mps.weight.data = upsample_cpu.weight.data.detach().clone().to("mps").requires_grad_()
+        upsample_mps.bias.data = upsample_cpu.bias.data.detach().clone().to("mps").requires_grad_()
+
+        h_cpu = downsample_cpu(input_cpu)
+        h_mps = downsample_mps(input_mps)
+        self.assertEqual(h_cpu, h_mps)
+
+        size_cpu = h_cpu.size()
+        size_mps = h_mps.size()
+        self.assertEqual(size_cpu, size_mps)
+
+        output_cpu = upsample_cpu(h_cpu, output_size=input_cpu.size())
+        output_mps = upsample_mps(h_mps, output_size=input_mps.size())
+        self.assertEqual(output_cpu, output_mps)
+        self.assertEqual(output_cpu.size(), output_mps.size())
 
     def test_conv2d_single_stride(self):
         y_cpu = torch.randn(2, 2, 3, 6)

From be8817bc2669b0261a77c25a397962943290860b Mon Sep 17 00:00:00 2001
From: Ramin Azarmehr <razarmehr@apple.com>
Date: Fri, 17 Feb 2023 15:42:50 -0500
Subject: [PATCH 25/29] Fix copy_cast_mps() on tensors with storage offset
 (#343)

This should fix the failure with GPT2 when use_cache=True
---
 aten/src/ATen/native/mps/operations/Copy.mm | 7 +++++--
 test/test_mps.py                            | 9 +++++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm
index e4c673145ada..94527cfd373f 100644
--- a/aten/src/ATen/native/mps/operations/Copy.mm
+++ b/aten/src/ATen/native/mps/operations/Copy.mm
@@ -251,8 +251,11 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   bool returnGatherOutput = dst_.is_contiguous();
   Tensor src;
   auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format());
+  const bool sameDataType = src_.dtype() == dst_.dtype();
 
-  if (!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) {
+  if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) ||
+      // the copy_cast path requires storage_offset to be applied before casting
+      (src_.storage_offset() && !sameDataType)) {
     Tensor emptyShell = Tensor();
     src = gatherViewTensor(src_, returnGatherOutput ? dst_ : emptyShell);
 
@@ -282,7 +285,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   src._set_neg(src_.is_neg());
 
   const size_t src_size = src.nbytes();
-  if (src.dtype() == dst_.dtype()) {
+  if (sameDataType) {
     MPSStream* stream = getCurrentMPSStream();
     // for GPU to GPU copies we only encode to stream's command buffer (no flushing)
     stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset);
diff --git a/test/test_mps.py b/test/test_mps.py
index 3de06f2bd276..bcaaf2a6bfc4 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -1799,6 +1799,15 @@ def test_slice_reshape(self):
         x_cpu = x_cpu + 2
         self.assertEqual(x, x_cpu)
 
+    def test_slice_casting(self):
+        # generate random binary numbers
+        cpu_in = torch.bernoulli(torch.empty(1, 1, 128, 128).uniform_(0, 1)).to(torch.uint8)
+        mps_in = cpu_in.detach().clone().to("mps")
+        # check copy_cast(unit8 -> bool) on tensors with storage offset
+        cpu_out = cpu_in[:, :, 11 : 12, :12].to(torch.bool)
+        mps_out = mps_in[:, :, 11 : 12, :12].to(torch.bool)
+        self.assertEqual(cpu_out, mps_out)
+
     def test_slice_reshape_contg_view(self):
         import torch
 

From 8e371167f84eb5acf508993f0eed5fcff2291d80 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <104024078+DenisVieriu97@users.noreply.github.com>
Date: Sat, 18 Feb 2023 10:27:48 -0800
Subject: [PATCH 26/29] Enable int8 in TestConsistency (#347)

---
 test/test_mps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_mps.py b/test/test_mps.py
index bcaaf2a6bfc4..8ddaaf42f765 100644
--- a/test/test_mps.py
+++ b/test/test_mps.py
@@ -8875,7 +8875,7 @@ def test_serialization_map_location(self):
 
 
 MPS_DTYPES = get_all_dtypes()
-for t in [torch.double, torch.cdouble, torch.cfloat, torch.int8, torch.bfloat16]:
+for t in [torch.double, torch.cdouble, torch.cfloat, torch.bfloat16]:
     del MPS_DTYPES[MPS_DTYPES.index(t)]
 
 abbrs_to_torch_dtype_dict = {value : key for (key, value) in dtype_abbrs.items()}

From 00135b15b650319c8ca96a1c85ebd7abd96a5500 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <dvieriu@apple.com>
Date: Mon, 20 Feb 2023 15:52:13 -0800
Subject: [PATCH 27/29] Enable test/nn/convolution on MPS

---
 test/nn/test_convolution.py                   | 641 +++++++++---------
 torch/testing/_internal/common_device_type.py |   6 +
 2 files changed, 344 insertions(+), 303 deletions(-)

diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index 5413513b3861..9f93bdce91f4 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -12,16 +12,16 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.testing._internal.common_dtype import floating_types_and, floating_and_complex_types_and
-from torch.testing._internal.common_utils import run_tests, \
+from torch.testing._internal.common_utils import TEST_WITH_MPS, run_tests, \
     skipIfRocmVersionLessThan, TEST_SCIPY, TEST_WITH_ROCM, \
     download_file, parametrize as parametrize_test, subtest, \
     instantiate_parametrized_tests, set_default_dtype
 from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN
 from torch.testing._internal.common_nn import NNTestCase, _test_module_empty_input
-from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, \
+from torch.testing._internal.common_device_type import dtypesIfMPS, instantiate_device_type_tests, dtypes, \
     dtypesIfCUDA, precisionOverride, skipCUDAIfNoCudnn, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
     skipCUDAIfRocm, skipCUDAIfRocmVersionLessThan, \
-    onlyNativeDeviceTypes, largeTensorTest, skipMeta, \
+    onlyNativeDeviceTypes, largeTensorTest, skipMPSIf, skipMeta, \
     disableMkldnn, skipCPUIfNoMkldnn, disablecuDNN, skipCUDAIfMiopen, skipCUDAIfNoMiopen
 
 from torch.testing import make_tensor
@@ -41,7 +41,7 @@ class TestConvolutionNN(NNTestCase):
     _do_cuda_memory_leak_check = True
     _do_cuda_non_default_stream = True
 
-    def test_conv_backcompat(self):
+    def test_conv_backcompat(self, device):
         from torch.serialization import SourceChangeWarning
 
         # This file was generated by running on PyTorch 1.0.1 on Python 2:
@@ -55,112 +55,117 @@ def test_conv_backcompat(self):
         path = download_file('https://download.pytorch.org/test_data/legacy_conv2d.pt')
         with warnings.catch_warnings():
             warnings.simplefilter('ignore', SourceChangeWarning)
-            m = torch.load(path, encoding='utf-8')
-        input = torch.randn((1, 1, 1, 1), dtype=torch.float)
+            m = torch.load(path, encoding='utf-8').to(device)
+        input = torch.randn((1, 1, 1, 1), dtype=torch.float, device=device)
         self.assertEqual(m(input).size(), (1, 1, 1, 1))
 
-    def test_invalid_conv1d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Calculated padded input size per channel: \(4\). ' +
-                                        r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'):
-                module(input)
-
-            # Negative stride check
-            module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-    def test_mismatch_shape_conv2d(self):
-        for dtype in (torch.float, torch.cfloat):
-            x = torch.randn(1, 10, 1, 28, 28, dtype=dtype)
-            w = torch.randn(6, 1, 5, 5, dtype=dtype)
-
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' +
-                                        r'input of size: \[1, 10, 1, 28, 28\]'):
-
-                F.conv2d(x, w)
 
-    def test_conv2d_discontiguous_weight(self):
-        for dtype in (torch.float, torch.cfloat):
-            # Test for https://github.com/pytorch/pytorch/issues/55781
-            x = torch.ones(64, 16, 16, 16, dtype=dtype)
-            weight = torch.arange(0, 1.0, 1 / 2.0 ** 10).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2]
-            self.assertFalse(weight.is_contiguous())
-            y = torch.nn.functional.conv2d(x, weight, None)
-            if torch.backends.mkldnn.is_available():
-                # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used
-                with torch.backends.mkldnn.flags(enabled=False):
-                    y_ = torch.nn.functional.conv2d(x, weight, None)
-                    self.assertEqual(y, y_)
-            self.assertEqual(y.sum(), 4186112.)
-
-    def test_invalid_conv2d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
-            input = torch.empty(1, 1, 4, 4).to(dtype)
-            self.assertRaises(RuntimeError, lambda: module(input))
-
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True)
-            input = torch.randn(1, 3, 1, 1)
-            with self.assertRaisesRegex(RuntimeError,
-                                        r'Calculated padded input size per channel: \(1 x 1\). ' +
-                                        r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'):
-                module(input)
+    @dtypes(*floating_and_complex_types_and(torch.half, torch.bfloat16))
+    @dtypesIfMPS(torch.half, torch.float)
+    def test_invalid_conv1d(self, device, dtype):
+        module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True, device=device).to(dtype)
+        input = torch.randn(1, 3, 4).to(dtype)
+        with self.assertRaisesRegex(RuntimeError,
+                                    r'Calculated padded input size per channel: \(4\). ' +
+                                    r'Kernel size: \(10\). Kernel size can\'t be greater than actual input size'):
+            module(input)
 
-            # Negative stride check
-            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
+        # Negative stride check
+        module = nn.Conv1d(in_channels=3, out_channels=6, kernel_size=3, stride=-1, bias=True).to(dtype)
+        input = torch.randn(1, 3, 4).to(dtype)
+        with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+            module(input)
 
-            # Zero stride check
-            module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True).to(dtype)
-            input = torch.randn(1, 3, 4, 4).to(dtype)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
-
-    def test_invalid_conv3d(self):
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2).to(dtype)
-            input = torch.empty(1, 1, 4, 4, 4).to(dtype)
-            self.assertRaises(RuntimeError, lambda: module(input))
+    @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
+    def test_mismatch_shape_conv2d(self, device, dtype):
+        x = torch.randn(1, 10, 1, 28, 28, dtype=dtype, device=device)
+        w = torch.randn(6, 1, 5, 5, dtype=dtype, device=device)
 
-            # Negative stride check
-            module = torch.nn.Conv3d(1, 1, kernel_size=3, stride=-2)
-            input = torch.empty(1, 1, 4, 4, 4)
-            with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
-                module(input)
+        with self.assertRaisesRegex(RuntimeError,
+                                    r'Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d, but got ' +
+                                    r'input of size: \[1, 10, 1, 28, 28\]'):
+            F.conv2d(x, w)
 
-    def test_conv_invalid_groups(self):
+    @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
+    def test_conv2d_discontiguous_weight(self, device, dtype):
+        # Test for https://github.com/pytorch/pytorch/issues/55781
+        x = torch.ones(64, 16, 16, 16, dtype=dtype, device=device)
+        weight = torch.arange(0, 1.0, 1 / 2.0 ** 10, device=device).reshape(32, 16, 1, 2).to(dtype)[:, :, :, ::2]
+        self.assertFalse(weight.is_contiguous())
+        y = torch.nn.functional.conv2d(x, weight, None).to(device)
+        if torch.backends.mkldnn.is_available():
+            # Disable MKLDNN explicitly, so that either NNPACK or THCNN will be used
+            with torch.backends.mkldnn.flags(enabled=False):
+                y_ = torch.nn.functional.conv2d(x, weight, None)
+                self.assertEqual(y, y_)
+        self.assertEqual(y.sum(), 4186112.)
+
+    @dtypes(*floating_and_complex_types_and(torch.bfloat16))
+    @dtypesIfMPS(torch.float)
+    def test_invalid_conv2d(self, device, dtype):
+        module = torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2, device=device).to(dtype)
+        input = torch.empty(1, 1, 4, 4, device=device).to(dtype)
+        self.assertRaises(RuntimeError, lambda: module(input))
+
+        module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, stride=1, bias=True, device=device)
+        input = torch.randn(1, 3, 1, 1, device=device)
+        with self.assertRaisesRegex(RuntimeError,
+                                    r'Calculated padded input size per channel: \(1 x 1\). ' +
+                                    r'Kernel size: \(10 x 10\). Kernel size can\'t be greater than actual input size'):
+            module(input)
+
+        # Negative stride check
+        module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=-1, bias=True, device=device).to(dtype)
+        input = torch.randn(1, 3, 4, 4, device=device).to(dtype)
+        with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+            module(input)
+
+        # Zero stride check
+        module = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=4, stride=0, bias=True, device=device).to(dtype)
+        input = torch.randn(1, 3, 4, 4, device=device).to(dtype)
+        with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+            module(input)
+
+    @dtypes(*floating_and_complex_types_and(torch.bfloat16))
+    @dtypesIfMPS(torch.float)
+    def test_invalid_conv3d(self, device, dtype):
+        module = torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, device=device).to(dtype)
+        input = torch.empty(1, 1, 4, 4, 4, device=device).to(dtype)
+        self.assertRaises(RuntimeError, lambda: module(input))
+
+        # Negative stride check
+        module = torch.nn.Conv3d(1, 1, kernel_size=3, stride=-2, device=device)
+        input = torch.empty(1, 1, 4, 4, 4, device=device)
+        with self.assertRaisesRegex(RuntimeError, 'non-positive stride is not supported'):
+            module(input)
+
+    def test_conv_invalid_groups(self, device):
         with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv1d(1, 1, kernel_size=3, dilation=2, stride=2, groups=0)
+            torch.nn.Conv1d(1, 1, kernel_size=3, dilation=2, stride=2, groups=0, device=device)
         with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-1)
+            torch.nn.Conv2d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-1, device=device)
         with self.assertRaisesRegex(ValueError, 'groups must be a positive integer'):
-            torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-2)
+            torch.nn.Conv3d(1, 1, kernel_size=3, dilation=2, stride=2, groups=-2, device=device)
 
-    def test_Conv1d_module_same_padding(self):
+    def test_Conv1d_module_same_padding(self, device):
         # Compare module against functional: without strides/dilation, asymmetric padding
-        x = torch.rand(1, 1, 20)
+        x = torch.rand(1, 1, 20, device=device)
         module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same')
+                           padding='same', device=device)
         expect = F.conv1d(x, module.weight, module.bias, padding='same')
         self.assertEqual(expect, module(x))
 
         # Test dilation, symmetric padding
         module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same', dilation=2)
+                           padding='same', dilation=2, device=device)
         expect = F.conv1d(x, module.weight, module.bias, padding='same', dilation=2)
         self.assertEqual(expect, module(x))
 
         # Test non-zero padding_mode, requiring explicit padding
         module = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=10,
-                           padding='same', padding_mode='replicate')
+                           padding='same', padding_mode='replicate', device=device)
         x_padded = F.pad(x, [4, 5], mode='replicate')
         expect = F.conv1d(x_padded, module.weight, module.bias, padding='valid')
         self.assertEqual(expect, module(x))
@@ -168,30 +173,30 @@ def test_Conv1d_module_same_padding(self):
 
         # Test connstruction with invalid padding string raises
         with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='foo', device=device)
 
         # Test connstruction with same padding and strides raises
         with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
+            module = nn.Conv1d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2, device=device)
 
-    def test_Conv2d_module_same_padding(self):
+    def test_Conv2d_module_same_padding(self, device):
         # Compare module against functional:
         # without strides/dilation, both symmetric and asymmetric padding
-        x = torch.rand(1, 1, 9, 20)
+        x = torch.rand(1, 1, 9, 20, device=device)
         module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(5, 10),
-                           padding='same')
+                           padding='same', device=device)
         expect = F.conv2d(x, module.weight, module.bias, padding='same')
         self.assertEqual(expect, module(x))
 
         # with dilation, symmetric padding
         module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
-                           padding='same', dilation=(1, 2))
+                           padding='same', dilation=(1, 2), device=device)
         expect = F.conv2d(x, module.weight, module.bias, padding='same', dilation=(1, 2))
         self.assertEqual(expect, module(x))
 
         # Test non-zero padding_mode, requiring explicit padding
         module = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(3, 4),
-                           padding='same', padding_mode='reflect')
+                           padding='same', padding_mode='reflect', device=device)
         x_padded = F.pad(x, [1, 2, 1, 1], mode='reflect')
         expect = F.conv2d(x_padded, module.weight, module.bias, padding='valid')
         self.assertEqual(expect, module(x))
@@ -199,7 +204,7 @@ def test_Conv2d_module_same_padding(self):
 
         # Test connstruction with invalid padding string raises
         with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='foo', device=device)
 
         # Test connstruction with same padding and strides raises
         with self.assertRaisesRegex(ValueError, "padding='same'"):
@@ -209,24 +214,25 @@ def test_Conv2d_module_same_padding(self):
         with self.assertRaisesRegex(ValueError, "padding='same'"):
             module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(4, 1))
 
-    def test_Conv3d_module_same_padding(self):
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_Conv3d_module_same_padding(self, device):
         # Compare module against functional:
-        x = torch.rand(1, 1, 4, 4, 4)
+        x = torch.rand(1, 1, 4, 4, 4, device=device)
         # without dilation, both symmetric and asymmetric padding
         module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same')
+                           padding='same', device=device)
         expect = F.conv3d(x, module.weight, module.bias, padding='same')
         self.assertEqual(expect, module(x))
 
         # with dilation, both symmetric and asymmetric padding
         module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same', dilation=(3, 2, 1))
+                           padding='same', dilation=(3, 2, 1),  device=device)
         expect = F.conv3d(x, module.weight, module.bias, padding='same', dilation=(3, 2, 1))
         self.assertEqual(expect, module(x))
 
         # Test non-zero padding_mode, requiring explicit padding
         module = nn.Conv3d(in_channels=1, out_channels=1, kernel_size=(2, 3, 4),
-                           padding='same', padding_mode='circular')
+                           padding='same', padding_mode='circular', device=device)
         x_padded = F.pad(x, [1, 2, 1, 1, 0, 1], mode='circular')
         expect = F.conv3d(x_padded, module.weight, module.bias, padding='valid')
         self.assertEqual(expect, module(x))
@@ -234,17 +240,17 @@ def test_Conv3d_module_same_padding(self):
 
         # Test connstruction with invalid padding string raises
         with self.assertRaisesRegex(ValueError, 'Invalid padding string'):
-            module = nn.Conv3d(in_channels=3, out_channels=33, kernel_size=10, padding='foo')
+            module = nn.Conv3d(in_channels=3, out_channels=33, kernel_size=10, padding='foo', device=device)
 
         # Test connstruction with same padding and strides raises
         with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2)
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=2, device=device)
         with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 1, 3))
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 1, 3), device=device)
         with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 4, 1))
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(1, 4, 1), device=device)
         with self.assertRaisesRegex(ValueError, "padding='same'"):
-            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(5, 1, 1))
+            module = nn.Conv2d(in_channels=3, out_channels=33, kernel_size=10, padding='same', stride=(5, 1, 1), device=device)
 
     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
     def test_thnn_conv_strided_padded_dilated(self):
@@ -275,9 +281,11 @@ def test_thnn_conv_strided_padded_dilated(self):
                         (inputs.cpu(), weight.cpu(), bias.cpu())
                     )
 
-    def test_Conv2d_inconsistent_types(self):
-        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float)
-        weights = torch.randn(1, 1, 3, 3, dtype=torch.double)
+    @dtypes(torch.double)
+    @dtypesIfMPS(torch.half)
+    def test_Conv2d_inconsistent_types(self, device, dtype):
+        inputs = torch.randn(4, 1, 7, 7, dtype=torch.float, device=device)
+        weights = torch.randn(1, 1, 3, 3, dtype=dtype, device=device)
         # inconsistent types should raise an exception
         self.assertRaises(RuntimeError, lambda: nn.functional.conv2d(inputs, weights))
         # but it should work with the same type
@@ -297,11 +305,13 @@ def test_Conv2d_inconsistent_types_on_GPU_without_cudnn(self):
             # but it should work with the same type
             nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
 
-    def test_Conv2d_1x1(self):
+    @dtypes(torch.double)
+    @skipMPSIf(True, "MPS: No support for double dtype")
+    def test_Conv2d_1x1(self, device, dtype):
         in_channels = 2
         out_channels = 2
-        mod = torch.nn.Conv2d(2, 2, 1, bias=False).to(dtype=torch.double)
-        input = torch.randn(1, in_channels, 5, 5, requires_grad=True, dtype=torch.double)
+        mod = torch.nn.Conv2d(2, 2, 1, bias=False, device=device).to(dtype=dtype)
+        input = torch.randn(1, in_channels, 5, 5, device=device, requires_grad=True, dtype=dtype)
         for enabled in (False, True):
             with torch.backends.mkldnn.flags(enabled=enabled):
                 gradcheck(F.conv2d, (input, mod.weight))
@@ -338,16 +348,14 @@ def run_once(group_val=24, dilation=1):
 
                 self.assertEqual(without_onednn, with_onednn)
 
-    @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
-    @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
-    def test_cudnn_non_contiguous(self):
-        x = torch.randn(192, 16, 50).cuda()
+    def test_cudnn_non_contiguous(self, device):
+        x = torch.randn(192, 16, 50).to(device)
         x = x.permute(0, 2, 1).contiguous().permute(0, 2, 1)
         m = torch.nn.Conv1d(
             in_channels=16,
             out_channels=32,
             kernel_size=2,
-            bias=True).cuda()
+            bias=True).to(device)
         result = m(x)
 
     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@@ -365,64 +373,66 @@ def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self):
             # but it should work with the same type
             nn.functional.conv2d(inputs.float(), weights.float(), bias.float())
 
-    def test_Conv2d_missing_argument(self):
-        c = nn.Conv2d(3, 3, 3)
+    def test_Conv2d_missing_argument(self, device):
+        c = nn.Conv2d(3, 3, 3, device=device)
         self.assertRaises(TypeError, lambda: c(None))
 
-    def test_Conv2d_backward_twice(self):
-        input = torch.randn(2, 3, 5, 5)
-        c = nn.Conv2d(3, 3, 3)
+    def test_Conv2d_backward_twice(self, device):
+        input = torch.randn(2, 3, 5, 5, device=device)
+        c = nn.Conv2d(3, 3, 3, device=device)
         o1 = c(input)
         o1.sum().backward()
         self.assertRaisesRegex(RuntimeError, 'Specify retain_graph=True',
                                lambda: o1.sum().backward())
 
+    @dtypes(*floating_types_and(torch.bfloat16))
+    @dtypesIfMPS(torch.float)
+    def test_conv_modules_raise_error_on_incorrect_input_size(self, device, dtype):
+        modules = [nn.Conv1d(3, 8, 3, device=device).to(dtype), nn.ConvTranspose1d(3, 8, 3, device=device).to(dtype),
+                    nn.Conv2d(3, 8, 3, device=device).to(dtype), nn.ConvTranspose2d(3, 8, 3, device=device).to(dtype),
+                    nn.Conv3d(3, 8, 3, device=device).to(dtype), nn.ConvTranspose3d(3, 8, 3, device=device).to(dtype)]
 
-    def test_conv_modules_raise_error_on_incorrect_input_size(self):
-        for dtype in [torch.bfloat16, torch.double, torch.float]:
-            modules = [nn.Conv1d(3, 8, 3).to(dtype), nn.ConvTranspose1d(3, 8, 3).to(dtype),
-                       nn.Conv2d(3, 8, 3).to(dtype), nn.ConvTranspose2d(3, 8, 3).to(dtype),
-                       nn.Conv3d(3, 8, 3).to(dtype), nn.ConvTranspose3d(3, 8, 3).to(dtype)]
+        invalid_input_dims = [(1, 4), (1, 4),
+                                (2, 5), (2, 5),
+                                (3, 6), (3, 6)]
 
-            invalid_input_dims = [(1, 4), (1, 4),
-                                  (2, 5), (2, 5),
-                                  (3, 6), (3, 6)]
-
-            for invalid_dims, module in zip(invalid_input_dims, modules):
-                for dims in invalid_dims:
-                    input = torch.empty(torch.Size((3, ) * dims))
-                    self.assertRaises(RuntimeError, lambda: module(input))
+        for invalid_dims, module in zip(invalid_input_dims, modules):
+            for dims in invalid_dims:
+                input = torch.empty(torch.Size((3, ) * dims), device=device)
+                self.assertRaises(RuntimeError, lambda: module(input))
 
-    def test_conv_shapecheck(self):
+    @dtypes(*floating_and_complex_types_and(torch.bfloat16))
+    @dtypesIfMPS(torch.float)
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_conv_shapecheck(self, device, dtype):
         def test(should_raise, module, input_size, dtype):
-            input = torch.empty(3, *input_size).to(dtype)
+            input = torch.empty(3, *input_size, device=device).to(dtype)
             if should_raise:
                 self.assertRaises(RuntimeError, lambda: module(input))
             else:
                 # just run it to ensure no exception raised.
                 module(input)
 
-        for dtype in [torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble]:
-            # Conv1d
-            test(True, nn.Conv1d(1, 1, 3).to(dtype), (1, 2), dtype)
-            test(True, nn.Conv1d(1, 1, 3, stride=2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 2, stride=2).to(dtype), (1, 2), dtype)
-            test(False, nn.Conv1d(1, 1, 3, stride=2, padding=1).to(dtype), (1, 2), dtype)
-
-            # Conv2d
-            test(True, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 2, 2), dtype)
-            test(False, nn.Conv2d(1, 1, (3, 3)).to(dtype), (1, 3, 3), dtype)
-            test(False, nn.Conv2d(1, 1, (3, 3), padding=1).to(dtype), (1, 2, 2), dtype)
-
-            # Conv3D
-            test(True, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 2, 2, 2), dtype)
-            test(False, nn.Conv3d(1, 1, (3, 3, 3)).to(dtype), (1, 3, 3, 3), dtype)
-            test(False, nn.Conv3d(1, 1, (3, 3, 3), padding=1).to(dtype), (1, 2, 2, 2), dtype)
-
-    def test_ConvTranspose2d_output_size(self):
-        m = nn.ConvTranspose2d(3, 4, 3, 3, 0, 2)
-        i = torch.randn(2, 3, 6, 6)
+        # Conv1d
+        test(True, nn.Conv1d(1, 1, 3, device=device).to(dtype), (1, 2), dtype)
+        test(True, nn.Conv1d(1, 1, 3, stride=2, device=device).to(dtype), (1, 2), dtype)
+        test(False, nn.Conv1d(1, 1, 2, device=device).to(dtype), (1, 2), dtype)
+        test(False, nn.Conv1d(1, 1, 2, stride=2, device=device).to(dtype), (1, 2), dtype)
+        test(False, nn.Conv1d(1, 1, 3, stride=2, padding=1, device=device).to(dtype), (1, 2), dtype)
+
+        # Conv2d
+        test(True, nn.Conv2d(1, 1, (3, 3), device=device).to(dtype), (1, 2, 2), dtype)
+        test(False, nn.Conv2d(1, 1, (3, 3), device=device).to(dtype), (1, 3, 3), dtype)
+        test(False, nn.Conv2d(1, 1, (3, 3), padding=1, device=device).to(dtype), (1, 2, 2), dtype)
+
+        # Conv3D
+        test(True, nn.Conv3d(1, 1, (3, 3, 3), device=device).to(dtype), (1, 2, 2, 2), dtype)
+        test(False, nn.Conv3d(1, 1, (3, 3, 3), device=device).to(dtype), (1, 3, 3, 3), dtype)
+        test(False, nn.Conv3d(1, 1, (3, 3, 3), padding=1, device=device).to(dtype), (1, 2, 2, 2), dtype)
+
+    def test_ConvTranspose2d_output_size(self, device):
+        m = nn.ConvTranspose2d(3, 4, 3, 3, 0, 2, device=device)
+        i = torch.randn(2, 3, 6, 6, device=device)
         for h in range(15, 22):
             for w in range(15, 22):
                 if 18 <= h <= 20 and 18 <= w <= 20:
@@ -431,7 +441,7 @@ def test_ConvTranspose2d_output_size(self):
                 else:
                     self.assertRaises(ValueError, lambda: m(i, (h, w)))
 
-    def test_ConvTranspose2d_output_size_downsample_upsample(self):
+    def test_ConvTranspose2d_output_size_downsample_upsample(self, device):
         b, c, hid_c = 2, 3, 2
         for h in range(13, 24):
             for w in range(13, 17):
@@ -446,6 +456,7 @@ def test_ConvTranspose2d_output_size_downsample_upsample(self):
                                     stride=s,
                                     padding=p,
                                     dilation=d,
+                                    device=device
                                 )
 
                                 t_conv = nn.ConvTranspose2d(
@@ -455,18 +466,20 @@ def test_ConvTranspose2d_output_size_downsample_upsample(self):
                                     stride=s,
                                     padding=p,
                                     dilation=d,
+                                    device=device
                                 )
 
-                                i = torch.randn(b, c, h, w)
+                                i = torch.randn(b, c, h, w, device=device)
 
                                 out = t_conv(conv(i), output_size=i.shape)
 
                                 self.assertEqual(out.size()[2:], i.size()[2:])
 
-    def test_ConvTranspose3d_correct_output_size(self):
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_ConvTranspose3d_correct_output_size(self, device):
         # Check that ConvTranspose3d can take a 5d output_size.
-        m = nn.ConvTranspose3d(2, 2, 2)
-        i = torch.rand(1, 2, 1, 1, 1)
+        m = nn.ConvTranspose3d(2, 2, 2, device=device)
+        i = torch.rand(1, 2, 1, 1, 1, device=device)
         out = m(i, output_size=(1, 2, 2, 2, 2))
 
     @unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@@ -481,95 +494,88 @@ def test_ConvTranspose2d_half_cublas_gemm(self):
     # For https://github.com/pytorch/pytorch/pull/1273
     # Almost identical to the above `test_Conv2d_naive_groups`
     @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_groups_nobias(self):
-        dev_dtypes = [("cpu", torch.float)]
-        if TEST_CUDA:
-            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
-        if AMPERE_OR_ROCM:
-            dev_dtypes += [("cuda", torch.bfloat16)]
-        for device, dtype in dev_dtypes:
-            m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype)
-            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
-            output = m(i)
-            grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
-            output.backward(grad_output)
+    @dtypes(torch.float, *[torch.bfloat16] if AMPERE_OR_ROCM else [])
+    @dtypesIfCUDA(torch.float, torch.half)
+    def test_Conv2d_groups_nobias(self, device, dtype):
+        m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype)
+        i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
+        output = m(i)
+        grad_output = torch.randn(2, 4, 4, 4, device=device, dtype=dtype)
+        output.backward(grad_output)
 
-            m1 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
-            m1.weight.data.copy_(m.weight.data[:2])
-            i1 = i.data[:, :2].contiguous().requires_grad_(True)
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :2].contiguous())
+        m1 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
+        m1.weight.data.copy_(m.weight.data[:2])
+        i1 = i.data[:, :2].contiguous().requires_grad_(True)
+        output1 = m1(i1)
+        output1.backward(grad_output[:, :2].contiguous())
 
-            m2 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[2:])
-            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-            output2 = m2(i2)
-            output2.backward(grad_output[:, 2:].contiguous())
+        m2 = nn.Conv2d(2, 2, kernel_size=3, bias=False).to(device, dtype)
+        m2.weight.data.copy_(m.weight.data[2:])
+        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+        output2 = m2(i2)
+        output2.backward(grad_output[:, 2:].contiguous())
 
-            self.assertEqual(output, torch.cat([output1, output2], 1))
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(output, torch.cat([output1, output2], 1))
+        self.assertEqual(i.grad.data,
+                         torch.cat([i1.grad.data, i2.grad.data], 1),
+                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(m.weight.grad.data,
+                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                         atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
 
     # Almost identical to the above `test_Conv2d_naive_groups`
     # Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16
     # See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686
     # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
     @torch.backends.cudnn.flags(enabled=True, benchmark=False)
-    def test_Conv2d_groups_nobias_v2(self):
+    @dtypes(torch.float, *[torch.bfloat16] if AMPERE_OR_ROCM else [])
+    @dtypesIfCUDA(torch.float, torch.half)
+    def test_Conv2d_groups_nobias_v2(self, device, dtype):
         torch.manual_seed(123)
-        dev_dtypes = [("cpu", torch.float)]
-        if TEST_CUDA:
-            dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)]
-        if AMPERE_OR_ROCM:
-            dev_dtypes += [("cuda", torch.bfloat16)]
-        for device, dtype in dev_dtypes:
-            m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype)
-            i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
-            output = m(i)
-            grad_output = torch.randn(2, 16, 4, 4, device=device, dtype=dtype)
-            output.backward(grad_output)
+        m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype)
+        i = torch.randn(2, 4, 6, 6, device=device, dtype=dtype, requires_grad=True)
+        output = m(i)
+        grad_output = torch.randn(2, 16, 4, 4, device=device, dtype=dtype)
+        output.backward(grad_output)
 
-            m1 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
-            m1.weight.data.copy_(m.weight.data[:8])
-            i1 = i.data[:, :2].contiguous().requires_grad_(True)
-            output1 = m1(i1)
-            output1.backward(grad_output[:, :8].contiguous())
+        m1 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
+        m1.weight.data.copy_(m.weight.data[:8])
+        i1 = i.data[:, :2].contiguous().requires_grad_(True)
+        output1 = m1(i1)
+        output1.backward(grad_output[:, :8].contiguous())
 
-            m2 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
-            m2.weight.data.copy_(m.weight.data[8:])
-            i2 = i.data[:, 2:].contiguous().requires_grad_(True)
-            output2 = m2(i2)
-            output2.backward(grad_output[:, 8:].contiguous())
+        m2 = nn.Conv2d(2, 8, kernel_size=3, bias=False).to(device, dtype)
+        m2.weight.data.copy_(m.weight.data[8:])
+        i2 = i.data[:, 2:].contiguous().requires_grad_(True)
+        output2 = m2(i2)
+        output2.backward(grad_output[:, 8:].contiguous())
 
-            self.assertEqual(output, torch.cat([output1, output2], 1))
-            self.assertEqual(i.grad.data,
-                             torch.cat([i1.grad.data, i2.grad.data], 1),
-                             atol=dtype2prec_DONTUSE[dtype], rtol=0)
-            self.assertEqual(m.weight.grad.data,
-                             torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
-                             atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(output, torch.cat([output1, output2], 1))
+        self.assertEqual(i.grad.data,
+                         torch.cat([i1.grad.data, i2.grad.data], 1),
+                         atol=dtype2prec_DONTUSE[dtype], rtol=0)
+        self.assertEqual(m.weight.grad.data,
+                         torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
+                         atol=1e-1 if dtype == torch.half else dtype2prec_DONTUSE[dtype], rtol=0)
 
     # CPU-only test for group conv3d fast implementation using bmm
     # See: https://github.com/pytorch/pytorch/pull/36355
-    def test_Conv3d_groups_nobias(self):
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_Conv3d_groups_nobias(self, device):
         torch.manual_seed(123)
-        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=False).to("cpu", torch.float)
-        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
+        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=False).to(device, torch.float)
+        i = torch.randn(2, 4, 6, 6, 6, device=device, dtype=torch.float, requires_grad=True)
         output = m(i)
-        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
+        grad_output = torch.randn(2, 16, 4, 4, 4, device=device, dtype=torch.float)
         output.backward(grad_output)
 
-        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
+        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to(device, torch.float)
         m1.weight.data.copy_(m.weight.data[:8])
         i1 = i.data[:, :2].contiguous().requires_grad_(True)
         output1 = m1(i1)
         output1.backward(grad_output[:, :8].contiguous())
 
-        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to("cpu", torch.float)
+        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=False).to(device, torch.float)
         m2.weight.data.copy_(m.weight.data[8:])
         i2 = i.data[:, 2:].contiguous().requires_grad_(True)
         output2 = m2(i2)
@@ -583,22 +589,23 @@ def test_Conv3d_groups_nobias(self):
                          torch.cat([m1.weight.grad.data, m2.weight.grad.data], 0),
                          atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
 
-    def test_Conv3d_groups_wbias(self):
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_Conv3d_groups_wbias(self, device):
         torch.manual_seed(123)
-        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=True).to("cpu", torch.float)
-        i = torch.randn(2, 4, 6, 6, 6, device="cpu", dtype=torch.float, requires_grad=True)
+        m = nn.Conv3d(4, 16, kernel_size=3, groups=2, bias=True).to(device, torch.float)
+        i = torch.randn(2, 4, 6, 6, 6, device=device, dtype=torch.float, requires_grad=True)
         output = m(i)
-        grad_output = torch.randn(2, 16, 4, 4, 4, device="cpu", dtype=torch.float)
+        grad_output = torch.randn(2, 16, 4, 4, 4, device=device, dtype=torch.float)
         output.backward(grad_output)
 
-        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
+        m1 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to(device, torch.float)
         m1.weight.data.copy_(m.weight.data[:8])
         m1.bias.data.copy_(m.bias.data[:8])
         i1 = i.data[:, :2].contiguous().requires_grad_(True)
         output1 = m1(i1)
         output1.backward(grad_output[:, :8].contiguous())
 
-        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to("cpu", torch.float)
+        m2 = nn.Conv3d(2, 8, kernel_size=3, bias=True).to(device, torch.float)
         m2.weight.data.copy_(m.weight.data[8:])
         m2.bias.data.copy_(m.bias.data[8:])
         i2 = i.data[:, 2:].contiguous().requires_grad_(True)
@@ -618,8 +625,12 @@ def test_Conv3d_groups_wbias(self):
                          torch.cat([m1.bias.grad.data, m2.bias.grad.data], 0),
                          atol=dtype2prec_DONTUSE[torch.float], rtol=dtype2prec_DONTUSE[torch.float])
 
-    def test_conv_tbc(self):
-        with set_default_dtype(torch.double):
+    @skipMPSIf(True, "MPS: No support for double dtype")
+    def test_conv_tbc(self, device):
+        default_dtype = torch.double
+        if device.startswith("mps"):
+            default_dtype = torch.float
+        with set_default_dtype(default_dtype):
             inp = torch.randn(9, 4, 5, requires_grad=True)
             weight = torch.randn(3, 5, 6, requires_grad=True)
             bias = torch.randn(6, requires_grad=True)
@@ -664,18 +675,18 @@ def test_conv_cudnn_memory_layout_dominance(self):
         self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
 
 
-    @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
-    def test_cudnn_noncontiguous_weight(self):
+    @dtypes(torch.float)
+    @dtypesIfCUDA(torch.double)
+    def test_cudnn_noncontiguous_weight(self, device, dtype):
         # Noncontiguous weights must be contiguous() before being
         # passed to cuDNN
-        input = torch.tensor([1, 1, 1], dtype=torch.double, device="cuda").view(1, 1, 3)
-        weights1 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2)
-        weights2 = torch.tensor([1], dtype=torch.double, device="cuda").expand(1, 1, 2).contiguous()
+        input = torch.tensor([1, 1, 1], dtype=dtype, device=device).view(1, 1, 3)
+        weights1 = torch.tensor([1], dtype=dtype, device=device).expand(1, 1, 2)
+        weights2 = torch.tensor([1], dtype=dtype, device=device).expand(1, 1, 2).contiguous()
         self.assertEqual(F.conv1d(input, weights1, bias=None, stride=2, dilation=2),
                          F.conv1d(input, weights2, bias=None, stride=2, dilation=2))
 
-
-    def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input'):
+    def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input', device="cpu"):
         for kern, inp_size in [(3, 6), (3, 7), (4, 9)]:
             for batch, stride, padding, chan_in, chan_out, dilation in \
                     product([1, 2], [1, 2], [0, 1, 2], [2], [3], [1]):
@@ -687,13 +698,13 @@ def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input
                         input_shape.append(inp_size)
                         weight_shape.append(kern)
 
-                    input = torch.randn(input_shape, requires_grad=True)
-                    weight = torch.randn(weight_shape, requires_grad=True)
+                    input = torch.randn(input_shape, requires_grad=True, device=device)
+                    weight = torch.randn(weight_shape, requires_grad=True, device=device)
                     if has_bias:
-                        bias = torch.randn([chan_out], requires_grad=True)
+                        bias = torch.randn([chan_out], requires_grad=True, device=device)
                     output = func_forward(input, weight, stride=stride, padding=padding, dilation=dilation, bias=bias)
 
-                    gradient_o = torch.randn(output.shape)
+                    gradient_o = torch.randn(output.shape, device=device)
                     gradient_w = torch.autograd.grad(output, input if (gradient == 'input') else weight, gradient_o)
 
                     self.assertEqual(gradient_w[0],
@@ -705,23 +716,25 @@ def run_grad_conv_test(self, func_forward, func_backward, dim=1, gradient='input
                                      padding=padding,
                                      dilation=dilation))
 
-    def test_grad_conv1d_input(self):
-        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_input, 1, 'input')
+    def test_grad_conv1d_input(self, device):
+        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_input, 1, 'input', device)
 
-    def test_grad_conv1d_weight(self):
-        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_weight, 1, 'weight')
+    def test_grad_conv1d_weight(self, device):
+        self.run_grad_conv_test(F.conv1d, F.grad.conv1d_weight, 1, 'weight', device)
 
-    def test_grad_conv2d_input(self):
-        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_input, 2, 'input')
+    def test_grad_conv2d_input(self, device):
+        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_input, 2, 'input', device)
 
-    def test_grad_conv2d_weight(self):
-        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_weight, 2, 'weight')
+    def test_grad_conv2d_weight(self, device):
+        self.run_grad_conv_test(F.conv2d, F.grad.conv2d_weight, 2, 'weight', device)
 
-    def test_grad_conv3d_input(self):
-        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_input, 3, 'input')
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_grad_conv3d_input(self, device):
+        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_input, 3, 'input', device)
 
-    def test_grad_conv3d_weight(self):
-        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_weight, 3, 'weight')
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_grad_conv3d_weight(self, device):
+        self.run_grad_conv_test(F.conv3d, F.grad.conv3d_weight, 3, 'weight', device)
 
     @unittest.skipIf(not torch._nnpack_available(), "NNPACK unavailable")
     def test_nnpack_conv(self):
@@ -751,23 +764,23 @@ def test_nnpack_conv(self):
                     for gr, gr_expected in zip(grads, grads_expected):
                         self.assertEqual(gr, gr_expected, atol=3e-4, rtol=0)
 
-    def test_conv_padding_mode(self):
+    def test_conv_padding_mode(self, device):
         with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
-            nn.Conv2d(3, 3, 3, padding_mode="xyz")
+            nn.Conv2d(3, 3, 3, padding_mode="xyz", device=device)
 
         with self.assertRaisesRegex(ValueError, "padding_mode must be one of"):
-            nn.Conv2d(3, 3, 3, padding_mode=3)
+            nn.Conv2d(3, 3, 3, padding_mode=3, device=device)
 
         with self.assertRaisesRegex(ValueError, "Only \"zeros\" "):
-            nn.ConvTranspose2d(3, 3, 3, padding_mode="reflect")
+            nn.ConvTranspose2d(3, 3, 3, padding_mode="reflect", device=device)
 
-
-    def test_functional_grad_conv(self):
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
+    def test_functional_grad_conv(self, device):
         # Conv 1D
-        input = torch.randn(1, 1, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, requires_grad=True)
+        input = torch.randn(1, 1, 5, requires_grad=True, device=device)
+        weight = torch.randn(1, 1, 3, requires_grad=True, device=device)
         output = F.conv1d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
+        grad_output = torch.randn(output.shape, device=device)
 
         grad_input_autograd, grad_weight_autograd = torch.autograd.grad(output, (input, weight), grad_output)
 
@@ -778,10 +791,10 @@ def test_functional_grad_conv(self):
         self.assertEqual(grad_weight_functional, grad_weight_autograd)
 
         # Conv 2D
-        input = torch.randn(1, 1, 5, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, 3, requires_grad=True)
+        input = torch.randn(1, 1, 5, 5, requires_grad=True, device=device)
+        weight = torch.randn(1, 1, 3, 3, requires_grad=True, device=device)
         output = F.conv2d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
+        grad_output = torch.randn(output.shape, device=device)
 
         (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
 
@@ -792,10 +805,10 @@ def test_functional_grad_conv(self):
         self.assertEqual(grad_weight_functional, grad_weight_autograd)
 
         # Conv 3D
-        input = torch.randn(1, 1, 5, 5, 5, requires_grad=True)
-        weight = torch.randn(1, 1, 3, 3, 3, requires_grad=True)
+        input = torch.randn(1, 1, 5, 5, 5, requires_grad=True, device=device)
+        weight = torch.randn(1, 1, 3, 3, 3, requires_grad=True, device=device)
         output = F.conv3d(input, weight, dilation=2)
-        grad_output = torch.randn(output.shape)
+        grad_output = torch.randn(output.shape, device=device)
 
         (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
 
@@ -805,7 +818,7 @@ def test_functional_grad_conv(self):
         grad_weight_functional = torch.nn.grad.conv3d_weight(input, weight.shape, grad_output, dilation=2)
         self.assertEqual(grad_weight_functional, grad_weight_autograd)
 
-    def test_functional_grad_conv2d(self):
+    def test_functional_grad_conv2d(self, device):
         BATCH_SIZE = 4
         IN_CH = 8
         OUT_CH = 16
@@ -814,14 +827,14 @@ def test_functional_grad_conv2d(self):
         def _test_conv2d(stride, kernel_size, groups, dilation):
             padding = kernel_size // 2
 
-            input = torch.empty(BATCH_SIZE, IN_CH, SPATIAL, SPATIAL).uniform_(-8.0, 8.0).requires_grad_(True)
+            input = torch.empty(BATCH_SIZE, IN_CH, SPATIAL, SPATIAL, device=device).uniform_(-8.0, 8.0).requires_grad_(True)
 
-            weight = torch.empty(OUT_CH, IN_CH // groups, kernel_size, kernel_size).uniform_(-4.0, 4.0).requires_grad_(True)
+            weight = torch.empty(OUT_CH, IN_CH // groups, kernel_size, kernel_size, device=device).uniform_(-4.0, 4.0).requires_grad_(True)
 
             output = F.conv2d(input, weight,
                               stride=stride, padding=padding, dilation=dilation, groups=groups)
 
-            grad_output = torch.randn(output.shape)
+            grad_output = torch.randn(output.shape, device=device)
 
             (grad_input_autograd, grad_weight_autograd) = torch.autograd.grad(output, (input, weight), grad_output)
 
@@ -845,12 +858,7 @@ def _test_conv2d(stride, kernel_size, groups, dilation):
 class TestConvolutionNNDeviceType(NNTestCase):
     def run_conv_double_back_test(self, kern, stride, padding, chan_in, chan_out, batch_size,
                                   inp_size, dilation, no_weight, groups=1, use_cuda=False,
-                                  use_bias=True, dtype=torch.double):
-        if use_cuda:
-            device = torch.device("cuda")
-        else:
-            device = torch.device("cpu")
-
+                                  use_bias=True, dtype=torch.double, device="cpu"):
         x = torch.randn(batch_size, chan_in, inp_size, inp_size, device=device,
                         dtype=dtype, requires_grad=True)
         weight = torch.randn(chan_out, chan_in // groups, kern, kern, device=device,
@@ -887,10 +895,11 @@ def func(*inputs):
 
         return gradgradcheck(func, inputs, (grad_y,))
 
-    @onlyCUDA
     @skipCUDAIfNoCudnn
     @dtypes(*floating_and_complex_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
+    @dtypesIfMPS(torch.float)
     def test_Conv2d_deterministic_cudnn(self, device, dtype):
+        print(dtype)
         inputs = torch.randn(2, 3, 5, 5, device=device, dtype=dtype, requires_grad=True)
         with cudnn.flags(enabled=True, benchmark=True, deterministic=True):
             conv1 = torch.nn.Conv2d(3, 3, 3).to(device, dtype)
@@ -906,7 +915,6 @@ def test_Conv2d_deterministic_cudnn(self, device, dtype):
             self.assertEqual(conv1.bias.grad.data, conv2.bias.grad.data, atol=0.0, rtol=0)
             self.assertEqual(conv1.weight.grad.data, conv2.weight.grad.data, atol=0.0, rtol=0)
 
-
     @onlyCUDA
     @dtypes(*floating_types_and(torch.half, *[torch.bfloat16] if AMPERE_OR_ROCM else []))
     def test_Conv2d_large_workspace(self, device, dtype):
@@ -930,8 +938,7 @@ def run_test(benchmark):
         run_test(benchmark=True)
 
 
-    @onlyCUDA
-    @dtypes(torch.half, torch.float)
+    @dtypes(torch.float)
     def test_ConvTranspose2d_large_output_padding(self, device, dtype):
         net1 = torch.nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)\
             .to(device=device, dtype=dtype)
@@ -944,8 +951,10 @@ def test_ConvTranspose2d_large_output_padding(self, device, dtype):
         x = net2(x)
         x = net3(x)
         x.backward(torch.randn_like(x))
-        torch.cuda.synchronize()
-
+        if device.startswith("cuda"):
+            torch.cuda.synchronize()
+        elif device.startswith("mps"):
+            torch.mps.synchronize()
 
     @onlyCUDA
     @tf32_on_and_off(0.01)
@@ -1066,7 +1075,7 @@ def test_conv_double_backward(self, device, dtype):
                     result = self.run_conv_double_back_test(kern, stride,
                                                             padding, chan_in, chan_out,
                                                             batch_size, inp_size, dilation,
-                                                            no_weight, use_cuda=True, dtype=dtype)
+                                                            no_weight, use_cuda=True, dtype=dtype, device=device)
                     self.assertTrue(result,
                                     "Conv double backward test failed with parameters:" +
                                     "\nkern: " + str(kern) +
@@ -1078,8 +1087,8 @@ def test_conv_double_backward(self, device, dtype):
                                     "\ninp_size: " + str(inp_size) +
                                     "\ndilation: " + str(dilation))
 
-
-    def test_conv_double_backward_no_bias(self):
+    @skipMPSIf(True, "MPS: No support for double dtype")
+    def test_conv_double_backward_no_bias(self, device):
         kern = 3
         stride = 2
         chan_in, chan_out = 2, 4
@@ -1092,7 +1101,7 @@ def test_conv_double_backward_no_bias(self):
         result = self.run_conv_double_back_test(kern, stride,
                                                 padding, chan_in, chan_out,
                                                 batch_size, inp_size, dilation,
-                                                no_weight, use_bias=use_bias)
+                                                no_weight, use_bias=use_bias, device=device)
         self.assertTrue(result,
                         "Conv double backward test failed with parameters:" +
                         "\nkern: " + str(kern) +
@@ -1104,8 +1113,8 @@ def test_conv_double_backward_no_bias(self):
                         "\ninp_size: " + str(inp_size) +
                         "\ndilation: " + str(dilation))
 
-
-    def test_conv_double_backward_groups(self):
+    @skipMPSIf(True, "MPS: No support for double dtype")
+    def test_conv_double_backward_groups(self, device):
         kern = 3
         stride = 1
         padding = 2
@@ -1118,7 +1127,7 @@ def test_conv_double_backward_groups(self):
         result = self.run_conv_double_back_test(kern, stride,
                                                 padding, chan_in * groups, chan_out * groups,
                                                 batch_size, inp_size, dilation,
-                                                no_weight, groups=groups)
+                                                no_weight, groups=groups, device=device)
         self.assertTrue(result,
                         "Conv double backward test failed with parameters:" +
                         "\nkern: " + str(kern) +
@@ -1131,8 +1140,8 @@ def test_conv_double_backward_groups(self):
                         "\ndilation: " + str(dilation) +
                         "\ngroups: " + str(groups))
 
-
-    def test_conv_double_backward_stride(self):
+    @skipMPSIf(True, "MPS: No support for double dtype")
+    def test_conv_double_backward_stride(self, device):
         batch_size = 2
 
         # Cannot provide ggW when stride is > 1
@@ -1142,10 +1151,11 @@ def test_conv_double_backward_stride(self):
                 self.run_conv_double_back_test(kern, stride,
                                                padding, chan_in, chan_out,
                                                batch_size, inp_size, dilation,
-                                               no_weight)
+                                               no_weight, device=device)
 
     @dtypes(torch.float, torch.cfloat)
     @torch.backends.cudnn.flags(enabled=True, benchmark=False)
+    @dtypesIfMPS(torch.float)
     def test_conv1d_same_padding(self, device, dtype):
         # Test padding='same' outputs the correct shape
         test_args = [
@@ -1185,6 +1195,7 @@ def test_conv1d_same_padding(self, device, dtype):
         self.assertEqual(expect, actual)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv2d_same_padding(self, device, dtype):
         if dtype is torch.cfloat:
             rtol, atol = 2e-6, 2e-6
@@ -1211,6 +1222,8 @@ def test_conv2d_same_padding(self, device, dtype):
         self.assertEqual(expect, actual, rtol=rtol, atol=atol)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
     def test_conv3d_same_padding(self, device, dtype):
         if dtype is torch.cfloat:
             rtol, atol = 2e-6, 2e-6
@@ -1236,6 +1249,7 @@ def test_conv3d_same_padding(self, device, dtype):
         self.assertEqual(expect, actual, rtol=rtol, atol=atol)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv1d_valid_padding(self, device, dtype):
         # Test F.conv1d padding='valid' is the same as no padding
         x = torch.rand(1, 1, 10, device=device, dtype=dtype)
@@ -1245,6 +1259,7 @@ def test_conv1d_valid_padding(self, device, dtype):
         self.assertEqual(expect, actual)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv2d_valid_padding(self, device, dtype):
         # Test F.conv2d padding='valid' is the same as no padding
         x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype)
@@ -1254,6 +1269,8 @@ def test_conv2d_valid_padding(self, device, dtype):
         self.assertEqual(expect, actual)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
     def test_conv3d_valid_padding(self, device, dtype):
         # Test F.conv3d padding='valid' is the same as no padding
         x = torch.rand(1, 1, 1, 1, 10, dtype=dtype, device=device)
@@ -1263,6 +1280,7 @@ def test_conv3d_valid_padding(self, device, dtype):
         self.assertEqual(expect, actual)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv1d_same_padding_backward(self, device, dtype):
         # Test F.conv1d gradients work with padding='same'
         x = torch.rand(1, 1, 12, dtype=dtype, device=device, requires_grad=True)
@@ -1292,6 +1310,7 @@ def test_conv1d_same_padding_backward(self, device, dtype):
         self.assertEqual(gy_expect, y.grad)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv2d_same_padding_backward(self, device, dtype):
         # Test F.conv2d gradients work with padding='same'
         x = torch.rand(1, 1, 10, 11, device=device, dtype=dtype, requires_grad=True)
@@ -1322,6 +1341,8 @@ def test_conv2d_same_padding_backward(self, device, dtype):
         self.assertEqual(gy_expect, y.grad)
 
     @dtypes(torch.double, torch.cdouble)
+    @dtypesIfMPS(torch.float)
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
     def test_conv3d_same_padding_backward(self, device, dtype):
         check_forward_ad = torch.device(device).type != 'xla'
 
@@ -1368,6 +1389,7 @@ def test_conv3d_same_padding_backward(self, device, dtype):
                           check_fwd_over_rev=True)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv1d_valid_padding_backward(self, device, dtype):
         # Test F.conv1d gradients work with padding='valid'
         x = torch.rand(1, 1, 10, dtype=dtype, device=device, requires_grad=True)
@@ -1383,6 +1405,7 @@ def test_conv1d_valid_padding_backward(self, device, dtype):
 
     @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     @parametrize_test("mode", ('valid', 'same'))
     def test_conv1d_vs_scipy(self, device, dtype, mode):
         t = make_tensor((1, 10), device=device, dtype=dtype)
@@ -1422,6 +1445,7 @@ def _test(t, weight, mode):
 
     @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     @parametrize_test("mode", ('valid', 'same'))
     def test_conv2d_vs_scipy(self, device, dtype, mode):
         t = make_tensor((1, 5, 10), device=device, dtype=dtype)
@@ -1462,6 +1486,7 @@ def _test(t, weight, mode):
 
     @unittest.skipIf(not TEST_SCIPY, "Scipy required for the test.")
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     @parametrize_test("mode", ('valid', 'same'))
     def test_conv3d_vs_scipy(self, device, dtype, mode):
         t = make_tensor((1, 5, 5, 10), device=device, dtype=dtype)
@@ -1506,6 +1531,7 @@ def _test(t, weight, mode):
             _test(t, weight_odd, mode)
 
     @dtypes(torch.float, torch.complex64)
+    @dtypesIfMPS(torch.float)
     def test_conv2d_valid_padding_backward(self, device, dtype):
         # Test F.conv2d gradients work with padding='valid'
         x = torch.rand(1, 1, 1, 10, device=device, dtype=dtype, requires_grad=True)
@@ -1520,6 +1546,8 @@ def test_conv2d_valid_padding_backward(self, device, dtype):
         self.assertEqual(gy_expect, gy_actual)
 
     @dtypes(torch.double, torch.cdouble)
+    @dtypesIfMPS(torch.float)
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
     def test_conv3d_valid_padding_backward(self, device, dtype):
         check_forward_ad = torch.device(device).type != 'xla'
 
@@ -1539,6 +1567,7 @@ def test_conv3d_valid_padding_backward(self, device, dtype):
         gradgradcheck(lambda x, y: F.conv3d(x, y, padding='valid'), (x, y), check_fwd_over_rev=check_forward_ad)
 
     @parametrize_test("N", range(2, 4), name_fn=lambda N: 'ConvTranspose{}d'.format(N))
+    @skipMPSIf(True, "Conv is not supported on MPS")
     def test_conv_transpose_with_output_size_and_no_batch_dim(self, device, N):
         # For inputs with no batch dim, verify output is the correct shape when output_size is set.
         # See https://github.com/pytorch/pytorch/issues/75889
@@ -1818,6 +1847,7 @@ def test_conv_ic1_channels_last_for_oneDNN(self):
                     self.assertEqual(y, y_)
 
     @dtypes(torch.float, torch.cfloat)
+    @dtypesIfMPS(torch.float)
     def test_conv_empty_channel(self, device, dtype):
         in_channels = 0
         mod = torch.nn.Conv1d(in_channels, 8, 2, stride=2, dtype=dtype).to(device)
@@ -1882,6 +1912,7 @@ def test_conv_large_nosplit(self, device):
         input_large = torch.randn(1, 1, 2048, 1024 , dtype=dtype, device=device)
         conv2(input_large)
 
+    @skipMPSIf(True, "Conv3D is not supported on MPS")
     def test_conv_noncontig_weights(self, device):
         for dim in (1, 2, 3):
             for grouped in (False, True):
@@ -2086,6 +2117,7 @@ def test_Conv2d_naive_groups(self, device, dtype):
                          atol=dtype2prec_DONTUSE[dtype], rtol=0)
 
     @dtypes(torch.double, torch.cdouble)
+    @skipMPSIf(True, "MPS: No support for double dtype")
     def test_Conv2d_backward_depthwise(self, device, dtype):
         x = torch.randn(2, 2, 4, 20, device=device, dtype=dtype, requires_grad=True)
         weight = torch.randn(2, 1, 3, 5, device=device, dtype=dtype, requires_grad=True)
@@ -2094,12 +2126,15 @@ def conv2d_depthwise(x, weight):
             return torch.nn.functional.conv2d(
                 x, weight, bias=None, stride=(1, 10), groups=2)
 
-        for cudnn_enabled in [False, True]:
-            with torch.backends.cudnn.flags(enabled=cudnn_enabled):
-                torch.autograd.gradcheck(conv2d_depthwise, (x, weight))
+        if device.startswith("cuda"):
+            for cudnn_enabled in [False, True]:
+                with torch.backends.cudnn.flags(enabled=cudnn_enabled):
+                    torch.autograd.gradcheck(conv2d_depthwise, (x, weight))
+        else:
+            torch.autograd.gradcheck(conv2d_depthwise, (x, weight))
 
-    @onlyCPU
     @dtypes(torch.float, torch.double)
+    @dtypesIfMPS(torch.float)
     def test_conv_thnn_nhwc(self, device, dtype):
         def helper(mod, n, c, h, w, out_channels, kernel_size, dilation, groups, input_format, weight_format):
             input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device)\
@@ -2484,7 +2519,7 @@ def test_conv3d_64bit_indexing(self, device):
         self.assertEqual(yref, y)
 
 instantiate_device_type_tests(TestConvolutionNNDeviceType, globals())
-instantiate_parametrized_tests(TestConvolutionNN)
+instantiate_device_type_tests(TestConvolutionNN, globals())
 
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py
index 75e87155c7ca..1ee2a37385b5 100644
--- a/torch/testing/_internal/common_device_type.py
+++ b/torch/testing/_internal/common_device_type.py
@@ -900,6 +900,12 @@ class skipCUDAIf(skipIf):
     def __init__(self, dep, reason):
         super().__init__(dep, reason, device_type='cuda')
 
+# Skips a test on MPS if the condition is true.
+class skipMPSIf(skipIf):
+
+    def __init__(self, dep, reason):
+        super().__init__(dep, reason, device_type='mps')
+
 # Skips a test on Meta if the condition is true.
 class skipMetaIf(skipIf):
 

From 3f2d092ae077db386d8ffa2b37aa95149f38db64 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <dvieriu@apple.com>
Date: Mon, 20 Feb 2023 16:05:16 -0800
Subject: [PATCH 28/29] Run lintrunner on all files

---
 .github/workflows/lint.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 58566ebc3746..92af46249f6c 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -70,7 +70,7 @@ jobs:
           # shellcheck disable=SC1090
           set -ex
           set +e
-          if ! ${CONDA_RUN} lintrunner --force-color aten/src/ATen/native/mps/operations/* test/test_mps.py test/test_modules.py; then
+          if ! ${CONDA_RUN} lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then
               echo ""
               echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m"
               echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"

From 930af6308ddbf9c753731288a3e5fbb5b4a894f0 Mon Sep 17 00:00:00 2001
From: Denis Vieriu <dvieriu@apple.com>
Date: Mon, 20 Feb 2023 16:09:18 -0800
Subject: [PATCH 29/29] Run test/nn/test_convolution.py on CI

---
 .github/workflows/_mac-test-mps.yml | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml
index f9c402a772ac..b5cba073efb5 100644
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@@ -84,7 +84,7 @@ jobs:
           ${CONDA_RUN} python3 test/run_test.py --mps --verbose
 
       - name: Run MPS Test Modules
-        id: test_2
+        id: test_mps_modules
         env:
           ENV_NAME: conda-test-env-${{ github.run_id }}
         shell: arch -arch arm64 bash {0}
@@ -97,6 +97,18 @@ jobs:
 
           ${CONDA_RUN} python3 test/test_modules.py -k mps --verbose
 
+      - name: Run test/nn/test_convolution.py
+        id: test_nn_test_convolution
+        env:
+          ENV_NAME: conda-test-env-${{ github.run_id }}
+        shell: arch -arch arm64 bash {0}
+        run: |
+          # shellcheck disable=SC1090
+          set -ex
+          # TODO(https://github.com/pytorch/pytorch/issues/79293)
+
+          ${CONDA_RUN} python3 test/nn/test_convolution.py -k mps --verbose
+
       - name: Print remaining test logs
         shell: bash
         if: always()